diff --git "a/gen.py" "b/gen.py"
new file mode 100644
--- /dev/null
+++ "b/gen.py"
@@ -0,0 +1,3821 @@
+import ast
+import copy
+import functools
+import inspect
+import queue
+import sys
+import os
+import time
+import traceback
+import typing
+import warnings
+from datetime import datetime
+import requests
+from requests import ConnectTimeout, JSONDecodeError
+from urllib3.exceptions import ConnectTimeoutError, MaxRetryError, ConnectionError
+from requests.exceptions import ConnectionError as ConnectionError2
+from requests.exceptions import ReadTimeout as ReadTimeout2
+
+if os.path.dirname(os.path.abspath(__file__)) not in sys.path:
+    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
+os.environ['BITSANDBYTES_NOWELCOME'] = '1'
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+
+# more threads are typically not useful; don't let these libraries exceed limits and eat up resources
+max_cores = max(1, os.cpu_count() // 2)
+if os.getenv('NUMEXPR_MAX_THREADS') is None:
+    os.environ['NUMEXPR_MAX_THREADS'] = str(min(8, max_cores))
+if os.getenv('NUMEXPR_NUM_THREADS') is None:
+    os.environ['NUMEXPR_NUM_THREADS'] = str(min(8, max_cores))
+if os.getenv('OMP_NUM_THREADS') is None:
+    os.environ['OMP_NUM_THREADS'] = str(min(8, max_cores))
+if os.getenv('OPENBLAS_NUM_THREADS') is None:
+    os.environ['OPENBLAS_NUM_THREADS'] = str(min(8, max_cores))
+if os.getenv('DUCKDB_NUM_THREADS') is None:
+    os.environ['DUCKDB_NUM_THREADS'] = str(min(4, max_cores))
+if os.getenv('RAYON_RS_NUM_CPUS') is None:
+    os.environ['RAYON_RS_NUM_CPUS'] = str(min(8, max_cores))
+if os.getenv('RAYON_NUM_THREADS') is None:
+    os.environ['RAYON_NUM_THREADS'] = str(min(8, max_cores))
+
+import numpy as np
+from evaluate_params import eval_func_param_names, no_default_param_names, input_args_list
+from enums import DocumentSubset, LangChainMode, no_lora_str, model_token_mapping, no_model_str, \
+    LangChainAction, LangChainAgent, DocumentChoice, LangChainTypes, super_source_prefix, \
+    super_source_postfix, t5_type, get_langchain_prompts, gr_to_lg, invalid_key_msg
+from loaders import get_loaders
+from utils import set_seed, clear_torch_cache, NullContext, wrapped_partial, EThread, get_githash, \
+    import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, FakeTokenizer, \
+    have_langchain, set_openai, cuda_vis_check, H2O_Fire, lg_to_gr, str_to_list, str_to_dict, get_token_count
+
+start_faulthandler()
+import_matplotlib()
+
+SEED = 1236
+set_seed(SEED)
+
+from typing import Union
+
+import torch
+from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
+
+from prompter import Prompter, inv_prompt_type_to_model_lower, non_hf_types, PromptType, get_prompt, generate_prompt
+from stopping import get_stopping
+
+langchain_actions = [x.value for x in list(LangChainAction)]
+
+langchain_agents_list = [x.value for x in list(LangChainAgent)]
+
+
+def main(
+        load_8bit: bool = False,
+        load_4bit: bool = False,
+        low_bit_mode: int = 1,
+        load_half: bool = None,
+        load_gptq: str = '',
+        load_exllama: bool = False,
+        use_safetensors: bool = False,
+        revision: str = None,
+        use_gpu_id: bool = True,
+        base_model: str = '',
+        tokenizer_base_model: str = '',
+        lora_weights: str = "",
+        gpu_id: int = 0,
+        compile_model: bool = None,
+        use_cache: bool = None,
+        inference_server: str = "",
+        prompt_type: Union[int, str] = None,
+        prompt_dict: typing.Dict = None,
+        system_prompt: str = '',
+
+        # llama and gpt4all settings
+        llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=0),
+        model_path_llama: str = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin',
+        # 'llama-2-7b-chat.ggmlv3.q8_0.bin',
+        model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin',
+        model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin',
+        model_name_exllama_if_no_config: str = 'TheBloke/Nous-Hermes-Llama2-GPTQ',
+
+        model_lock: typing.List[typing.Dict[str, str]] = None,
+        model_lock_columns: int = None,
+        fail_if_cannot_connect: bool = False,
+
+        # input to generation
+        temperature: float = None,
+        top_p: float = None,
+        top_k: int = None,
+        num_beams: int = None,
+        repetition_penalty: float = None,
+        num_return_sequences: int = None,
+        do_sample: bool = None,
+        max_new_tokens: int = None,
+        min_new_tokens: int = None,
+        early_stopping: Union[bool, str] = None,
+        max_time: float = None,
+
+        memory_restriction_level: int = None,
+        debug: bool = False,
+        save_dir: str = None,
+        share: bool = False,
+        local_files_only: bool = False,
+        resume_download: bool = True,
+        use_auth_token: Union[str, bool] = False,
+        trust_remote_code: Union[str, bool] = True,
+        rope_scaling: dict = None,
+        max_seq_len: int = None,
+        offload_folder: str = "offline_folder",
+
+        src_lang: str = "English",
+        tgt_lang: str = "Russian",
+
+        prepare_offline_level: int = 0,
+        cli: bool = False,
+        cli_loop: bool = True,
+        gradio: bool = True,
+        gradio_offline_level: int = 0,
+        server_name: str = "0.0.0.0",
+        root_path: str = "",
+        chat: bool = True,
+        chat_conversation: typing.List[typing.Tuple[str, str]] = None,
+        text_context_list: typing.List[str] = None,
+        stream_output: bool = True,
+        async_output: bool = True,
+        num_async: int = 3,
+        show_examples: bool = None,
+        verbose: bool = False,
+        h2ocolors: bool = True,
+        dark: bool = False,  # light tends to be best
+        height: int = 600,
+        show_lora: bool = True,
+        show_llama: bool = True,
+        show_gpt4all: bool = False,
+        login_mode_if_model0: bool = False,
+        block_gradio_exit: bool = True,
+        concurrency_count: int = 1,
+        api_open: bool = False,
+        allow_api: bool = True,
+        input_lines: int = 1,
+        gradio_size: str = None,
+        show_copy_button: bool = True,
+        large_file_count_mode: bool = False,
+        pre_load_embedding_model: bool = True,
+
+        auth: Union[typing.List[typing.Tuple[str, str]], str] = None,
+        auth_filename: str = None,
+        auth_access: str = 'open',
+        auth_freeze: bool = False,
+        auth_message: str = None,
+        guest_name: str = "guest",
+        enforce_h2ogpt_api_key: bool = None,
+        h2ogpt_api_keys: Union[list, str] = [],
+        h2ogpt_key: str = None,
+
+        max_max_time=None,
+        max_max_new_tokens=None,
+
+        visible_models: list = None,
+        visible_visible_models: bool = True,
+        visible_submit_buttons: bool = True,
+        visible_side_bar: bool = True,
+        visible_doc_track: bool = True,
+        visible_chat_tab: bool = True,
+        visible_doc_selection_tab: bool = True,
+        visible_doc_view_tab: bool = True,
+        visible_chat_history_tab: bool = True,
+        visible_expert_tab: bool = True,
+        visible_models_tab: bool = True,
+        visible_system_tab: bool = True,
+        visible_tos_tab: bool = False,
+        visible_login_tab: bool = True,
+        visible_hosts_tab: bool = False,
+        chat_tables: bool = False,
+        visible_h2ogpt_header: bool = True,
+        max_raw_chunks: int = None,
+
+        sanitize_user_prompt: bool = False,
+        sanitize_bot_response: bool = False,
+
+        extra_model_options: typing.List[str] = [],
+        extra_lora_options: typing.List[str] = [],
+        extra_server_options: typing.List[str] = [],
+
+        score_model: str = 'auto',
+
+        eval_filename: str = None,
+        eval_prompts_only_num: int = 0,
+        eval_prompts_only_seed: int = 1234,
+        eval_as_output: bool = False,
+
+        langchain_mode: str = None,
+        user_path: str = None,
+        langchain_modes: list = [LangChainMode.USER_DATA.value, LangChainMode.MY_DATA.value, LangChainMode.LLM.value,
+                                 LangChainMode.DISABLED.value],
+        langchain_mode_paths: dict = {LangChainMode.USER_DATA.value: None},
+        langchain_mode_types: dict = {LangChainMode.USER_DATA.value: LangChainTypes.SHARED.value},
+        detect_user_path_changes_every_query: bool = False,
+
+        langchain_action: str = LangChainAction.QUERY.value,
+        langchain_agents: list = [],
+        force_langchain_evaluate: bool = False,
+
+        visible_langchain_actions: list = [LangChainAction.QUERY.value, LangChainAction.SUMMARIZE_MAP.value],
+        visible_langchain_agents: list = langchain_agents_list.copy(),
+
+        document_subset: str = DocumentSubset.Relevant.name,
+        document_choice: list = [DocumentChoice.ALL.value],
+
+        use_llm_if_no_docs: bool = True,
+        load_db_if_exists: bool = True,
+        keep_sources_in_context: bool = False,
+        db_type: str = 'chroma',
+        use_openai_embedding: bool = False,
+        use_openai_model: bool = False,
+        hf_embedding_model: str = None,
+        migrate_embedding_model: bool = False,
+        auto_migrate_db: bool = False,
+        cut_distance: float = 1.64,
+        answer_with_sources: bool = True,
+        append_sources_to_answer: bool = True,
+        show_accordions: bool = True,
+        top_k_docs_max_show: int = 10,
+        show_link_in_sources: bool = True,
+        pre_prompt_query: str = None,
+        prompt_query: str = None,
+        pre_prompt_summary: str = None,
+        prompt_summary: str = None,
+        add_chat_history_to_context: bool = True,
+        add_search_to_context: bool = False,
+        context: str = '',
+        iinput: str = '',
+        allow_upload_to_user_data: bool = True,
+        reload_langchain_state: bool = True,
+        allow_upload_to_my_data: bool = True,
+        enable_url_upload: bool = True,
+        enable_text_upload: bool = True,
+        enable_sources_list: bool = True,
+        chunk: bool = True,
+        chunk_size: int = 512,
+        top_k_docs: int = None,
+        docs_ordering_type: str = 'reverse_ucurve_sort',
+        min_max_new_tokens=256,
+        auto_reduce_chunks: bool = True,
+        max_chunks: int = 100,
+        headsize: int = 50,
+        n_jobs: int = -1,
+
+        # urls
+        use_unstructured=True,
+        use_playwright=False,
+        use_selenium=False,
+
+        # pdfs
+        use_pymupdf='auto',
+        use_unstructured_pdf='auto',
+        use_pypdf='auto',
+        enable_pdf_ocr='auto',
+        enable_pdf_doctr='auto',
+        try_pdf_as_html='auto',
+
+        # images
+        enable_ocr=False,
+        enable_doctr=False,
+        enable_pix2struct=False,
+        enable_captions=True,
+
+        pre_load_caption_model: bool = False,
+        caption_gpu: bool = True,
+        captions_model: str = "Salesforce/blip-image-captioning-base",
+        doctr_gpu: bool = True,
+
+        # json
+        jq_schema='.[]',
+
+        max_quality: bool = False,
+
+        enable_heap_analytics: bool = True,
+        heap_app_id: str = "1680123994",
+):
+    """
+
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_4bit: load model in 4-bit using bitsandbytes
+    :param low_bit_mode: 0: no quantization config, 1: change compute, 2: nf4, 3: double quant, 4: both 2 and 3
+           See: https://huggingface.co/docs/transformers/main_classes/quantization
+           If using older bitsandbytes or transformers, 0 is required
+    :param load_half: load model in float16 (None means auto, which means True unless t5 based model)
+                      otherwise specify bool
+    :param load_gptq: to load model with GPTQ, put model_basename here, e.g. gptq_model-4bit--1g
+    :param load_exllama: whether to use exllama (only applicable to LLaMa1/2 models with 16-bit or GPTQ)
+    :param use_safetensors: to use safetensors version (assumes file/HF points to safe tensors version)
+    :param revision: Which HF revision to use
+    :param use_gpu_id: whether to control devices with gpu_id.  If False, then spread across GPUs
+    :param base_model: model HF-type name.  If use --base_model to preload model, cannot unload in gradio in models tab
+    :param tokenizer_base_model: tokenizer HF-type name.  Usually not required, inferred from base_model.
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if use_gpu_id, then use gpu_id for cuda device ID, or auto mode if gpu_id == -1
+    :param compile_model: Whether to compile the model
+    :param use_cache: Whether to use caching in model (some models fail when multiple threads use)
+    :param inference_server: Consume base_model as type of model at this address
+                             Address can be text-generation-server hosting that base_model
+                             e.g. python generate.py --inference_server="http://192.168.1.46:6112" --base_model=h2oai/h2ogpt-oasst1-512-12b
+
+                             Or Address can be "openai_chat" or "openai" for OpenAI API
+                             Or Address can be "openai_azure_chat" or "openai_azure" for Azure OpenAI API
+                             e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
+                             e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
+                             e.g. python generate.py --inference_server="openai_azure_chat:<deployment_name>:<baseurl>:<api_version>:<model_version>" --base_model=gpt-3.5-turbo
+                             e.g. python generate.py --inference_server="openai_azure:<deployment_name>:<baseurl>:<api_version>:<model_version>" --base_model=text-davinci-003
+                             Optionals (Replace with None or just leave empty but keep :)
+                                 <deployment_name> of some deployment name
+                                 <baseurl>: e.g. "<endpoint>.openai.azure.com" for some <endpoint> without https://
+                                 <api_version> of some api, e.g. 2023-05-15
+                                 <model_version> e.g. 0613
+
+                             Or Address can be for vLLM:
+                              Use: "vllm:IP:port" for OpenAI-compliant vLLM endpoint
+                              Note: vllm_chat not supported by vLLM project.
+
+                             Or Address can be replicate:
+                             Use:
+                              --inference_server=replicate:<model name string> will use a Replicate server, requiring a Replicate key.
+                              e.g. <model name string> looks like "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
+
+                             Or Address can be for AWS SageMaker:
+                              Use: "sagemaker_chat:<endpoint name>" for chat models that AWS sets up as dialog
+                              Use: "sagemaker:<endpoint name>" for foundation models that AWS only text as inputs
+
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
+    :param system_prompt: Universal system prompt to use if model supports, like LLaMa2, regardless of prompt_type definition.
+           Useful for langchain case to control behavior, or OpenAI and Replicate.
+           If None, 'None', or 'auto', then for LLaMa or other models that internally have system_prompt, will use default for each model
+           If '', then no system prompt (no empty template given to model either, just no system part added at all)
+           If some string not in ['None', 'auto'], then use that as system prompt
+           Default is '', no system_prompt, because often it hurts performance/accuracy
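+           An illustrative example (any plain string works here):
+           e.g. python generate.py --base_model=h2oai/h2ogpt-oasst1-512-12b --system_prompt="You are a concise, helpful assistant."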
+
+    :param llamacpp_dict:
+           n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
+           use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
+           n_batch: Can be made smaller, e.g. 128, for slower low-memory CPU systems
+           n_gqa: Required to be 8 for LLaMa 70B
+           ... etc. anything that could be passed to llama.cpp or GPT4All models
+           e.g. python generate.py --base_model='llama' --prompt_type=llama2 --score_model=None --langchain_mode='UserData' --user_path=user_path --llamacpp_dict="{'n_gpu_layers':25,'n_batch':128}"
+    :param model_path_llama: model path or URL (for auto-download)
+    :param model_name_gptj: model path or URL (for auto-download)
+    :param model_name_gpt4all_llama: model path or URL (for auto-download)
+    :param model_name_exllama_if_no_config: exllama model's full path for model, tokenizer, generator for use when no HuggingFace config
+
+    :param model_lock: Lock models to specific combinations, for ease of use and extending to many models
+           Only used if gradio = True
+           List of dicts, each dict has base_model, tokenizer_base_model, lora_weights, inference_server, prompt_type, and prompt_dict
+           If all models have same prompt_type, and prompt_dict, can still specify that once in CLI outside model_lock as default for dict
+           Can specify model_lock instead of those items on CLI
+           As with CLI itself, base_model can infer prompt_type and prompt_dict if in prompter.py.
+             Also, tokenizer_base_model and lora_weights are optional.
+             Also, inference_server is optional if loading model from local system.
+           All models provided will automatically appear in compare model mode
+           Model loading-unloading and related choices will be disabled.  Model/lora/server adding will be disabled
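+           An illustrative example (hypothetical endpoint; any valid combination of the keys above works):
+           e.g. python generate.py --model_lock="[{'inference_server':'http://192.168.1.46:6112','base_model':'h2oai/h2ogpt-oasst1-512-12b'},{'base_model':'gpt-3.5-turbo','inference_server':'openai_chat'}]"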
+    :param model_lock_columns: How many columns to show if locking models (and so showing all at once)
+           If None, then defaults to up to 3
+           if -1, then all goes into 1 row
+           Maximum value is 4 due to non-dynamic gradio rendering elements
+    :param fail_if_cannot_connect: if doing model locking (e.g. with many models), fail if True.  Otherwise ignore.
+           Useful when many endpoints and want to just see what works, but still have to wait for timeout.
+
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param memory_restriction_level: 0 = no restriction to tokens or model, 1 = some restrictions on tokens, 2 = HF-like restriction, 3 = very low memory case
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of going to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param trust_remote_code: whether to trust any remote code needed for HF model
+    :param rope_scaling:
+           For HF transformers model: scaling for rope-based models, e.g. --rope_scaling="{'type':'dynamic', 'factor':4}"
+           For exllama model: --rope_scaling="{'alpha_value':4}" .  This automatically scales max_seq_len for exllama
+    :param max_seq_len: Manually set maximum sequence length for the LLM
+    :param offload_folder: path for spilling model onto disk
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+
+    :param prepare_offline_level:
+           Whether to just prepare for offline use, do not go into cli, eval, or gradio run modes
+           0 : no prep
+           1: prepare just h2oGPT with exact same setup as passed to CLI and ensure all artifacts for h2oGPT alone added to ~/.cache/
+           2: prepare h2oGPT + all inference servers so h2oGPT+inference servers can use the ~/.cache/
+    :param cli: whether to use CLI (non-gradio) interface.
+    :param cli_loop: whether to loop for CLI (False usually only for testing)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_offline_level: if > 0, then change fonts so fully offline
+           == 1 means backend won't need internet for fonts, but front-end UI might if font not cached
+           == 2 means backend and frontend don't need internet to download any fonts.
+           Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading.
+           This option further disables google fonts for downloading, which is less intrusive than uploading,
+           but still required in air-gapped case.  The fonts don't look as nice as google fonts, but ensure full offline behavior.
+           Also set --share=False to avoid sharing a gradio live link.
+    :param server_name: IP to use.  In linux 0.0.0.0 is good choice so exposed to outside host, else for only local use 127.0.0.1.
+                        For windows/MAC 0.0.0.0 or 127.0.0.1 will work, but may need to specify actual LAN IP address for other LAN clients to see.
+    :param root_path: The root path (or "mount point") of the application,
+           if it's not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy
+           that forwards requests to the application. For example, if the application is served at "https://example.com/myapp",
+           the `root_path` should be set to "/myapp".
+    :param chat: whether to enable chat mode with chat history
+    :param chat_conversation: list of tuples of (human, bot) conversation pre-appended to existing chat when using instruct/chat models
+           Requires also add_chat_history_to_context = True
+           It does *not* require chat=True, so works with nochat_api etc.
+    :param text_context_list: List of strings to add to context for non-database version of document Q/A for faster handling via API etc.
+           Forces LangChain code path and uses as many entries in list as possible given max_seq_len, with first assumed to be most relevant and to go near prompt.
+    :param stream_output: whether to stream output
+    :param async_output: Whether to do asyncio handling
+           For summarization
+           Applicable to HF TGI server
+           Only if stream_output=False in CLI, UI, or API
+    :param num_async: Number of simultaneously allowed asyncio calls to make for async_output
+           Too many will overload inference server, too few will be too slow
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param dark: whether to use dark mode for UI by default (still controlled in UI)
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param show_llama: whether to show LLaMa.cpp/GPT4All options in UI (only likely useful if have weak GPUs)
+    :param show_gpt4all: whether to show GPT4All models in UI (not often useful, llama.cpp models best)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param gradio_size: Overall size of text and spaces: "xsmall", "small", "medium", "large".
+           Small useful for many chatbots in model_lock mode
+    :param show_copy_button: Whether to show copy button for chatbots
+    :param large_file_count_mode: Whether to force manual update to UI of drop-downs, good idea if millions of chunks or documents
+    :param pre_load_embedding_model: Whether to preload embedding model for shared use across DBs and users (multi-thread safe only)
+
+    :param auth: gradio auth for launcher in form [(user1, pass1), (user2, pass2), ...]
+                 e.g. --auth=[('jon','password')] with no spaces
+                 e.g. --auth="[('jon', 'password)())(')]" so any special characters can be used
+                 e.g. --auth=auth.json to specify persisted state file with name auth.json (auth_filename then not required)
+                 e.g. --auth='' will use default auth.json as file name for persisted state file (auth_filename then not required)
+                 e.g. --auth=None will use no auth, but still keep track of auth state, just not from logins
+    :param auth_filename:
+         Set auth filename, used only if --auth= was passed a list of user/passwords
+    :param auth_access:
+         'open': Allow new users to be added
+         'closed': Stick to existing users
+    :param auth_freeze: whether freeze authentication based upon current file, no longer update file
+    :param auth_message: Message to show if having users login, fixed if passed, else dynamic internally
+    :param guest_name: guest name if using auth and have open access.
+           If '', then no guest allowed even if open access, then all databases for each user always persisted
+    :param enforce_h2ogpt_api_key: Whether to enforce h2oGPT token usage for API
+    :param h2ogpt_api_keys: list of tokens allowed for API access or file accessed on demand for json of list of keys
+    :param h2ogpt_key: E.g. can be set when accessing gradio h2oGPT server from local gradio h2oGPT server that acts as client to that inference server
+
+    :param max_max_time: Maximum max_time for gradio slider
+    :param max_max_new_tokens: Maximum max_new_tokens for gradio slider
+    :param min_max_new_tokens: Minimum of max_new_tokens, when auto-scaling down to handle more docs/prompt, but still let generation have some tokens
+
+    :param visible_models: Which models in model_lock list to show by default
+           Takes integers of position in model_lock (model_states) list or strings of base_model names
+           Ignored if model_lock not used
+           For nochat API, this is single item within a list for model by name or by index in model_lock
+                                If None, then just use first model in model_lock list
+                                If model_lock not set, use model selected by CLI --base_model etc.
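+           An illustrative example (assumes a two-model model_lock): --visible_models="[0,'h2oai/h2ogpt-oasst1-512-12b']" shows the first locked model plus the one with that base_model name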
+
+    :param visible_visible_models: Whether visible models drop-down is visible in UI
+    :param visible_submit_buttons: whether submit buttons are visible when UI first comes up
+    :param visible_side_bar: whether left side bar is visible when UI first comes up
+    :param visible_doc_track: whether left side bar's document tracking is visible when UI first comes up
+    :param visible_chat_tab: "" for chat tab
+    :param visible_doc_selection_tab:  "" for doc selection tab
+    :param visible_doc_view_tab: "" for doc view tab
+    :param visible_chat_history_tab: "" for chat history tab
+    :param visible_expert_tab: "" for expert tab
+    :param visible_models_tab: "" for models tab
+    :param visible_system_tab: "" for system tab
+    :param visible_tos_tab: "" for ToS tab
+    :param visible_login_tab: "" for Login tab
+    :param visible_hosts_tab: "" for hosts tab
+    :param chat_tables: Just show Chat as block without tab (useful if want only chat view)
+    :param visible_h2ogpt_header: Whether github stars, URL, logo, and QR code are visible
+    :param max_raw_chunks: Maximum number of chunks to show in UI when asking for raw DB text from documents/collection
+
+    :param sanitize_user_prompt: whether to remove profanity from user input (slows down input processing)
+      Requires optional packages:
+      pip install alt-profanity-check==1.2.2 better-profanity==0.7.0
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output (about 2x slower generation for long streaming cases due to better_profanity being slow)
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param extra_server_options: extra servers to show in list in gradio
+    :param score_model: which model to score responses
+           None: no response scoring
+           'auto': auto mode, '' (no model) for CPU or 1 GPU, 'OpenAssistant/reward-model-deberta-v3-large-v2' for >=2 GPUs,
+            because on CPU takes too much compute just for scoring response
+    :param eval_filename: json file to use for evaluation; if None, uses ShareGPT
+    :param eval_prompts_only_num: for no gradio benchmark, if using eval_filename prompts for eval instead of examples
+    :param eval_prompts_only_seed: for no gradio benchmark, seed for eval_filename sampling
+    :param eval_as_output: for no gradio benchmark, whether to test eval_filename output itself
+
+    :param langchain_mode: Data source to include.  Choose "UserData" to only consume files from make_db.py.
+           None: auto mode, check if langchain package exists, at least do LLM if so, else Disabled
+           If not passed, then chosen to be first langchain_modes, else langchain_mode->Disabled is set if no langchain_modes either
+           WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires really good workstation to generate db, unless already present.
+    :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
+           If already have db, any new/changed files are added automatically if path set, does not have to be same path used for prior db sources
+    :param langchain_modes: dbs to generate at launch to be ready for LLM
+           Apart from additional user-defined collections, can include ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
+             But wiki_full is expensive and requires preparation
+           To allow personal space only live in session, add 'MyData' to list
+           Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
+           If have own user modes, need to add these here or add in UI.
+    :param langchain_mode_paths: dict of langchain_mode keys and disk path values to use for source of documents
+           E.g. "{'UserData2': 'userpath2'}"
+           A disk path may be None, e.g. --langchain_mode_paths="{'UserData2': None}" even if existing DB, to avoid new documents being added from that path; source links that are on disk still work.
+           If `--user_path` was passed, that path is used for 'UserData' instead of the value in this dict
+    :param langchain_mode_types: dict of langchain_mode keys and database types
+           E.g. python generate.py --base_model=llama --langchain_modes=['TestData'] --langchain_mode_types="{'TestData':'shared'}"
+           If the directory already exists, the type is inferred automatically, so this need not be passed
+    :param detect_user_path_changes_every_query: whether to detect if any files changed or added every similarity search (by file hashes).
+           Expensive for large number of files, so not done by default.  By default only detect changes during db loading.
+
+    :param langchain_action: Mode of LangChain operations to perform on documents.
+            Query: Make query of document(s)
+            Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
+            Summarize_all: Summarize document(s) using entire document at once
+            Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
+    :param langchain_agents: Which agents to use
+            'search': Use Web Search as context for LLM response, e.g. SERP if have SERPAPI_API_KEY in env
+    :param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
+
+    :param visible_langchain_actions: Which actions to allow
+    :param visible_langchain_agents: Which agents to allow
+
+    :param document_subset: Default document choice when taking subset of collection
+    :param document_choice: Chosen document(s) by internal name, 'All' means use all docs
+
+    :param use_llm_if_no_docs: Whether to use LLM even if no documents, when langchain_mode=UserData or MyData or custom
+    :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
+    :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
+    :param db_type: 'faiss' for in-memory
+                    'chroma' (for chroma >= 0.4)
+                    'chroma_old' (for chroma < 0.4) -- recommended for large collections
+                    'weaviate' for persisted on disk
+    :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
+    :param use_openai_model: Whether to use OpenAI model for use with vector db
+    :param hf_embedding_model: Which HF embedding model to use for vector db
+           Default is instructor-large with 768 dimensions per embedding if have GPUs, else all-MiniLM-L6-v2 if no GPUs
+           Can also choose simpler model with 384 dimensions per embedding: "sentence-transformers/all-MiniLM-L6-v2"
+           Can also choose even better embedding with 1024 dimensions: 'hkunlp/instructor-xl'
+           We support automatic changing of embeddings for chroma, with a backup of the db made if this is done
+    :param migrate_embedding_model: whether to use hf_embedding_model embedding even if database already had an embedding set.
+           used to migrate all embeddings to a new one, but will take time to re-embed.
+           Default (False) is to use the prior embedding for existing databases, and only use hf_embedding_model for new databases
+           If had old database without embedding saved, then hf_embedding_model is also used.
+    :param auto_migrate_db: whether to automatically migrate any chroma<0.4 database from duckdb -> sqlite version
+    :param cut_distance: Distance to cut off references with larger distances when showing references.
+           1.64 is good to avoid dropping references for all-MiniLM-L6-v2, but instructor-large will always show excessive references.
+           For all-MiniLM-L6-v2, a value of 1.5 can push out even more references, or a large value of 100 can avoid any loss of references.
+    :param answer_with_sources: Whether to determine (and return) sources
+    :param append_sources_to_answer: Whether to place source information in chat response (ignored by LLM).  Always disabled for API.
+    :param show_accordions: whether to show accordion for document references in chatbot UI
+    :param top_k_docs_max_show: Max number of docs to show in UI for sources
+           If web search is enabled, then this is modified to be max(top_k_docs_max_show, number of links used in search)
+    :param show_link_in_sources: Whether to show URL link to source document in references
+    :param pre_prompt_query: prompt before documents to query, if None then use internal defaults
+    :param prompt_query: prompt after documents to query, if None then use internal defaults
+    :param pre_prompt_summary: prompt before documents to summarize, if None then use internal defaults
+    :param prompt_summary: prompt after documents to summarize, if None then use internal defaults
+           For summarize, normal to have empty query (nothing added in ask anything in UI or empty string in API)
+           If pass query, template is "Focusing on %s, %s" % (query, prompt_summary)
+           If pass query and iinput, template is "Focusing on %s, %s, %s" % (query, iinput, prompt_summary)
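+           e.g. an illustrative rendering: query="key risks" gives "Focusing on key risks, <prompt_summary>"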
+    :param add_chat_history_to_context: Include chat context when performing action
+           Not supported yet for openai_chat when using document collection instead of LLM
+           Also not supported when using CLI mode
+    :param add_search_to_context: Include web search in context as augmented prompt
+    :param context: Default context to use (for system pre-context in gradio UI)
+           context comes before chat_conversation and any document Q/A from text_context_list
+    :param iinput: Default input for instruction-based prompts
+    :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db (UserData or custom user dbs)
+           Ensure pass user_path for the files uploaded to be moved to this location for linking.
+    :param reload_langchain_state: Whether to reload langchain_modes.pkl file that contains any new user collections.
+    :param allow_upload_to_my_data: Whether to allow file uploads to update personal vector db
+    :param enable_url_upload: Whether to allow upload from URL
+    :param enable_text_upload: Whether to allow upload of text
+    :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
+    :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
+    :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to be in context length
+    :param top_k_docs: For langchain_action query: number of chunks to give LLM
+                       -1 : auto-fills context up to max_seq_len
+                       For langchain_action summarize: number of document parts, like pages for PDF.
+                       There's no such thing as chunks for summarization.
+                       -1 : auto-fills context up to max_seq_len
+    :param docs_ordering_type:
+        Type of ordering of docs.
+        'best_first': Order by score so the worst match is nearest the prompt
+        'best_near_prompt' or 'reverse_sort' : reverse docs order so most relevant is closest to question.
+           Best choice for sufficiently smart model, and truncation occurs for oldest context, so best then too.
+           But smaller 6_9 models fail to use newest context and can get stuck on old information.
+        '' or None (i.e. default) or 'reverse_ucurve_sort' : Sort so most relevant is either near start or near end
+           Best to avoid "lost in middle" as well as avoid hallucinating off starting content that LLM focuses on alot.
+    :param auto_reduce_chunks: Whether to automatically reduce top_k_docs to fit context given prompt
+    :param max_chunks: If top_k_docs=-1, maximum number of chunks to allow
+    :param headsize: Maximum number of characters for head of document for UI to show
+    :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
+
+    :param use_unstructured: Enable unstructured URL loader
+    :param use_playwright: Enable PlayWright URL loader
+    :param use_selenium: Enable Selenium URL loader
+
+    :param use_pymupdf: enable PyMuPDF; 'auto' means try it first, then use the other loaders set to 'auto' if it gives no result
+    :param use_unstructured_pdf: enable Unstructured PDF loader; 'auto' means use if pymupdf fails to get doc result
+    :param use_pypdf: enable PyPDF loader; 'auto' means use if unstructured fails to get doc result
+    :param enable_pdf_ocr: 'auto' means only use OCR if normal text extraction fails.  Useful for pure image-based PDFs with text.
+                            If enable_pdf_doctr == 'on' then OCR is not done.
+                            'on' means always do OCR as additional parsing of same documents
+                            'off' means don't do OCR (e.g. because it's slow, even though 'auto' would only trigger it if nothing else worked)
+    :param enable_pdf_doctr: Whether to support doctr on pdfs; 'auto' means use it if no doc result obtained so far
+    :param try_pdf_as_html: Try "PDF" as if HTML file, in case web link has .pdf extension but really is just HTML
+
+    :param enable_ocr: Whether to support OCR on images
+    :param enable_doctr: Whether to support doctr on images (using OCR better than enable_ocr=True)
+    :param enable_pix2struct: Whether to support pix2struct on images for captions
+    :param enable_captions: Whether to support captions using BLIP for image files as documents,
+           then preloads that model if pre_load_caption_model=True
+
+    :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
+           parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
+           Recommended if using larger caption model
+    :param captions_model: Which model to use for captions.
+           captions_model: str = "Salesforce/blip-image-captioning-base",  # continue capable
+           captions_model: str = "Salesforce/blip2-flan-t5-xl",   # question/answer capable, 16GB state
+           captions_model: str = "Salesforce/blip2-flan-t5-xxl",  # question/answer capable, 60GB state
+           Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
+           Disabled for CPU since BLIP requires CUDA
+    :param caption_gpu: If support caption, then use GPU if exists
+
+    :param doctr_gpu: If support doctr, then use GPU if exists
+
+    :param jq_schema: control json loader
+           By default '.[]' ingests everything in brute-force way, but better to match your schema
+           See: https://python.langchain.com/docs/modules/data_connection/document_loaders/json#using-jsonloader
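+           An illustrative schema (assumes records shaped like [{"text": ..., "meta": ...}]): --jq_schema='.[].text' would ingest only each record's text field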
+
+    :param max_quality: Choose maximum quality ingestion with all available parsers
+           Pro: Catches document when some default parsers would fail
+           Pro: Enables DocTR that has much better OCR than Tesseract
+           Con: Fills DB with results from all parsers, so similarity search gives redundant results
+
+    :param enable_heap_analytics: Toggle telemetry.
+    :param heap_app_id: App ID for Heap, change to your ID.
+    :return:
+    """
+    if base_model is None:
+        base_model = ''
+    if tokenizer_base_model is None:
+        tokenizer_base_model = ''
+    if lora_weights is None:
+        lora_weights = ''
+    if inference_server is None:
+        inference_server = ''
+
+    # listen to env if set
+    model_lock = os.getenv('model_lock', str(model_lock))
+    model_lock = ast.literal_eval(model_lock)
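+    # illustrative env form (hypothetical endpoint), matching the list-of-dicts schema in the docstring:
+    #   export model_lock="[{'base_model': 'h2oai/h2ogpt-oasst1-512-12b', 'inference_server': 'http://192.168.1.46:6112'}]"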
+
+    chat_conversation = str_to_list(chat_conversation)
+    text_context_list = str_to_list(text_context_list)
+
+    llamacpp_dict = str_to_dict(llamacpp_dict)
+    # add others to single dict
+    llamacpp_dict['model_path_llama'] = model_path_llama
+    llamacpp_dict['model_name_gptj'] = model_name_gptj
+    llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
+    llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config
+    # if user overrides but doesn't set these:
+    if 'n_batch' not in llamacpp_dict:
+        llamacpp_dict['n_batch'] = 128
+    if 'n_gpu_layers' not in llamacpp_dict:
+        llamacpp_dict['n_gpu_layers'] = 100
+    if 'n_gqa' not in llamacpp_dict:
+        llamacpp_dict['n_gqa'] = 0
+
+    if os.environ.get('SERPAPI_API_KEY') is None and LangChainAgent.SEARCH.value in visible_langchain_agents:
+        visible_langchain_agents.remove(LangChainAgent.SEARCH.value)
+
+    if model_lock:
+        assert gradio, "model_lock only supported for gradio=True"
+        assert not cli, "model_lock only supported for cli=False"
+        assert not (not cli and not gradio), "model_lock not supported for eval mode (cli=gradio=False)"
+        assert not base_model, "Don't specify model_lock and base_model"
+        assert not tokenizer_base_model, "Don't specify model_lock and tokenizer_base_model"
+        assert not lora_weights, "Don't specify model_lock and lora_weights"
+        assert not inference_server, "Don't specify model_lock and inference_server"
+        # assert not prompt_type, "Don't specify model_lock and prompt_type"
+        # assert not prompt_dict, "Don't specify model_lock and prompt_dict"
+
+    n_jobs = int(os.getenv('n_jobs', str(n_jobs)))
+    is_hf = bool(int(os.getenv("HUGGINGFACE_SPACES", '0')))
+    is_gpth2oai = bool(int(os.getenv("GPT_H2O_AI", '0')))
+    is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
+    if is_public:
+        visible_tos_tab = visible_hosts_tab = True
+        if enforce_h2ogpt_api_key is None:
+            enforce_h2ogpt_api_key = True
+    else:
+        if enforce_h2ogpt_api_key is None:
+            enforce_h2ogpt_api_key = False
+    if isinstance(h2ogpt_api_keys, str) and not os.path.isfile(h2ogpt_api_keys):
+        h2ogpt_api_keys = str_to_list(h2ogpt_api_keys)
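+    # e.g. an illustrative inline value: h2ogpt_api_keys="['key1','key2']" becomes ['key1', 'key2'];
+    # a path to an existing file is kept as-is and read on demand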
+    if memory_restriction_level is None:
+        memory_restriction_level = 2 if is_hf else 0  # 2 assumes run on 24GB consumer GPU
+    else:
+        assert 0 <= memory_restriction_level <= 3, "Bad memory_restriction_level=%s" % memory_restriction_level
+    if n_jobs == -1:
+        # if -1, assume hyperthreaded cores and use only half; force user to pass n_jobs explicitly if cores are non-standard
+        n_jobs = max(1, os.cpu_count() // 2)
+    if is_public and os.getenv('n_jobs') is None:
+        n_jobs = min(n_jobs, max(1, min(os.cpu_count() // 2, 8)))
+    admin_pass = os.getenv("ADMIN_PASS")
+    # raised GPU exceptions will sometimes appear in the UI or during actual generation,
+    # but that is maybe better than an empty result
+    raise_generate_gpu_exceptions = True
+
+    rope_scaling = str_to_dict(rope_scaling)
+
+    if isinstance(auth, str):
+        if auth.strip().startswith('['):
+            auth = str_to_list(auth)
+    if isinstance(auth, str) and auth:
+        auth_filename = auth
+    if not auth_filename:
+        auth_filename = "auth.json"
+    assert isinstance(auth, (str, list, tuple, type(None))), "Unknown type %s for auth=%s" % (type(auth), auth)
+
+    # allow set token directly
+    use_auth_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", use_auth_token)
+    allow_upload_to_user_data = bool(
+        int(os.environ.get("allow_upload_to_user_data", str(int(allow_upload_to_user_data)))))
+    allow_upload_to_my_data = bool(int(os.environ.get("allow_upload_to_my_data", str(int(allow_upload_to_my_data)))))
+    height = int(os.environ.get("HEIGHT", height))
+    h2ocolors = bool(int(os.getenv('h2ocolors', h2ocolors)))
+
+    # allow enabling langchain via ENV
+    # FIRST PLACE where LangChain referenced, but no imports related to it
+    langchain_modes = ast.literal_eval(os.environ.get("langchain_modes", str(langchain_modes)))
+    if not isinstance(langchain_modes, list):
+        langchain_modes = []
+    # always allow DISABLED
+    if LangChainMode.DISABLED.value not in langchain_modes:
+        langchain_modes.append(LangChainMode.DISABLED.value)
+
+    # update
+    langchain_mode_paths = str_to_dict(langchain_mode_paths)
+    langchain_mode_types = str_to_dict(langchain_mode_types)
+    for lmode in [LangChainMode.GITHUB_H2OGPT.value,
+                  LangChainMode.H2O_DAI_DOCS.value,
+                  LangChainMode.WIKI.value,
+                  LangChainMode.WIKI_FULL.value,
+                  ]:
+        if lmode not in langchain_mode_types:
+            langchain_mode_types[lmode] = 'shared'
+        if lmode not in langchain_mode_paths:
+            langchain_mode_paths[lmode] = ''
+    if user_path:
+        user_path = makedirs(user_path, use_base=True)
+        langchain_mode_paths['UserData'] = user_path
+        langchain_mode_types['UserData'] = LangChainTypes.SHARED.value
+
+    if is_public:
+        allow_upload_to_user_data = False
+        if LangChainMode.USER_DATA.value in langchain_modes:
+            langchain_modes.remove(LangChainMode.USER_DATA.value)
+    if max_raw_chunks is None:
+        max_raw_chunks = 30 if is_public else 1000000
+
+    # in-place, for non-scratch dbs
+    if allow_upload_to_user_data:
+        # always listen to CLI-passed user_path if passed
+        if user_path:
+            langchain_mode_paths['UserData'] = user_path
+
+    assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % (
+        langchain_action, langchain_actions)
+    assert len(
+        set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
+
+    # auto-set langchain_mode
+    langchain_mode = os.environ.get("LANGCHAIN_MODE", langchain_mode)
+    if have_langchain and langchain_mode is None:
+        # start in chat mode, in case just want to chat and don't want to get "No documents to query" by default.
+        if LangChainMode.LLM.value in langchain_modes:
+            langchain_mode = LangChainMode.LLM.value
+        elif len(langchain_modes) >= 1:
+            # infer even if don't pass which langchain_mode, just langchain_modes.
+            langchain_mode = langchain_modes[0]
+        if allow_upload_to_user_data and not is_public and langchain_mode_paths.get('UserData'):
+            if verbose:
+                print("Auto set langchain_mode=%s.  Could use UserData instead." % langchain_mode, flush=True)
+        elif allow_upload_to_my_data:
+            if verbose:
+                print("Auto set langchain_mode=%s.  Could use MyData instead."
+                      "  To allow UserData to pull files from disk,"
+                      " set user_path or langchain_mode_paths, and ensure allow_upload_to_user_data=True" % langchain_mode,
+                      flush=True)
+        else:
+            raise RuntimeError("Please pass --langchain_mode=<chosen mode> out of %s" % langchain_modes)
+    if not have_langchain and langchain_mode not in [None, LangChainMode.DISABLED.value, LangChainMode.LLM.value]:
+        raise RuntimeError("Asked for LangChain mode but langchain python package cannot be found.")
+    if langchain_mode is None:
+        # if not set yet, disable
+        langchain_mode = LangChainMode.DISABLED.value
+        print("Auto set langchain_mode=%s  Have langchain package: %s" % (langchain_mode, have_langchain), flush=True)
+    # go ahead and add
+    if langchain_mode not in langchain_modes:
+        langchain_modes.append(langchain_mode)
+
+    if is_public:
+        allow_upload_to_user_data = False
+        input_lines = 1  # ensure set, for ease of use
+        temperature = 0.2 if temperature is None else temperature
+        top_p = 0.85 if top_p is None else top_p
+        top_k = 70 if top_k is None else top_k
+        if is_hf:
+            do_sample = True if do_sample is None else do_sample
+            top_k_docs = 3 if top_k_docs is None else top_k_docs
+        else:
+            # by default don't sample, too chatty
+            do_sample = False if do_sample is None else do_sample
+            top_k_docs = 4 if top_k_docs is None else top_k_docs
+
+        if memory_restriction_level == 2:
+            if not base_model and not inference_server and not model_lock:
+                base_model = 'h2oai/h2ogpt-oasst1-512-12b'
+                # don't set load_8bit if passed base_model, doesn't always work so can't just override
+                load_8bit = True
+                load_4bit = False  # FIXME - consider using 4-bit instead of 8-bit
+        elif not inference_server:
+            top_k_docs = 10 if top_k_docs is None else top_k_docs
+    if memory_restriction_level >= 2:
+        load_8bit = True
+        load_4bit = False  # FIXME - consider using 4-bit instead of 8-bit
+        if hf_embedding_model is None:
+            hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+        top_k_docs = 3 if top_k_docs is None else top_k_docs
+    if top_k_docs is None:
+        top_k_docs = 3
+    if is_public:
+        if not max_time:
+            max_time = 60 * 2
+        if not max_max_time:
+            max_max_time = max_time
+        if not max_new_tokens:
+            max_new_tokens = 256
+        if not max_max_new_tokens:
+            max_max_new_tokens = 512
+    else:
+        if not max_max_time:
+            max_max_time = 60 * 20
+        if not max_max_new_tokens:
+            max_max_new_tokens = 1024
+    if is_hf:
+        # must override share if in spaces
+        share = False
+        if not max_time:
+            max_time = 60 * 1
+        if not max_max_time:
+            max_max_time = max_time
+        # HF accounted for later in get_max_max_new_tokens()
+    save_dir = os.getenv('SAVE_DIR', save_dir)
+    save_dir = makedirs(save_dir, exist_ok=True, tmp_ok=True, use_base=True)
+    score_model = os.getenv('SCORE_MODEL', score_model)
+    if str(score_model) == 'None':
+        score_model = ''
+    concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
+    api_open = bool(int(os.getenv('API_OPEN', str(int(api_open)))))
+    allow_api = bool(int(os.getenv('ALLOW_API', str(int(allow_api)))))
+
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    n_gpus, gpu_ids = cuda_vis_check(n_gpus)
+
+    if load_half is None and t5_type(base_model):
+        load_half = False
+        print("load_half=%s auto-set for %s to avoid bad generation" % (load_half, base_model), flush=True)
+
+    if n_gpus == 0 or get_device() == "mps":
+        # No CUDA GPUs usable
+
+        if get_device() != "mps":
+            print("No GPUs detected", flush=True)
+
+        enable_captions = False
+        gpu_id = None
+        load_8bit = False
+        load_4bit = False
+        low_bit_mode = 1
+        if load_half is None:
+            # wouldn't work if specified True, but respect
+            load_half = False
+        load_gptq = ''
+        load_exllama = False
+        use_gpu_id = False
+        if get_device() == "cuda":
+            torch.backends.cudnn.benchmark = True
+            torch.backends.cudnn.enabled = False
+            torch.set_default_dtype(torch.float32)
+        if is_public and not inference_server and not model_lock:
+            # 12B uses ~94GB
+            # 6.9B uses ~47GB
+            base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b' if not base_model else base_model
+        if hf_embedding_model is None:
+            # if no GPUs, use simpler embedding model to avoid cost in time
+            hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+        if score_model == 'auto':
+            score_model = ''
+    else:
+        if load_half is None:
+            load_half = True
+        # CUDA GPUs visible
+        if score_model == 'auto':
+            if n_gpus >= 2:
+                # will by default place scoring model on last GPU
+                score_model = 'OpenAssistant/reward-model-deberta-v3-large-v2'
+            else:
+                score_model = ''
+        if hf_embedding_model is None:
+            # if still None, then set default
+            hf_embedding_model = 'hkunlp/instructor-large'
+
+    # get defaults
+    if base_model:
+        model_lower = base_model.lower()
+    elif model_lock:
+        # have 0th model be thought of as normal model
+        assert len(model_lock) > 0 and model_lock[0]['base_model']
+        model_lower = model_lock[0]['base_model'].lower()
+    else:
+        model_lower = ''
+    if not gradio:
+        # force off, else output is not a single response as wanted for inspection
+        stream_output = False
+        # else prompt removal can mess up output
+        chat = False
+    # hard-coded defaults
+    first_para = False
+    text_limit = None
+
+    if compile_model is None:
+        # to avoid noisy CLI
+        compile_model = not cli
+
+    if offload_folder:
+        offload_folder = makedirs(offload_folder, exist_ok=True, tmp_ok=True, use_base=True)
+
+    # defaults
+    caption_loader = None
+    doctr_loader = None
+    pix2struct_loader = None
+
+    image_loaders_options0, image_loaders_options, \
+        pdf_loaders_options0, pdf_loaders_options, \
+        url_loaders_options0, url_loaders_options = lg_to_gr(**locals())
+    jq_schema0 = jq_schema
+    # transcribe default loader options into the active selections
+    image_loaders = image_loaders_options0
+    pdf_loaders = pdf_loaders_options0
+    url_loaders = url_loaders_options0
+
+    placeholder_instruction, placeholder_input, \
+        stream_output, show_examples, \
+        prompt_type, prompt_dict, \
+        temperature, top_p, top_k, num_beams, \
+        max_new_tokens, min_new_tokens, early_stopping, max_time, \
+        repetition_penalty, num_return_sequences, \
+        do_sample, \
+        src_lang, tgt_lang, \
+        examples, \
+        task_info = \
+        get_generate_params(model_lower,
+                            chat,
+                            stream_output, show_examples,
+                            prompt_type, prompt_dict,
+                            system_prompt,
+                            pre_prompt_query, prompt_query,
+                            pre_prompt_summary, prompt_summary,
+                            temperature, top_p, top_k, num_beams,
+                            max_new_tokens, min_new_tokens, early_stopping, max_time,
+                            repetition_penalty, num_return_sequences,
+                            do_sample,
+                            top_k_docs,
+                            chunk,
+                            chunk_size,
+                            image_loaders,
+                            pdf_loaders,
+                            url_loaders,
+                            jq_schema,
+                            docs_ordering_type,
+                            min_max_new_tokens,
+                            verbose,
+                            )
+
+    git_hash = get_githash() if is_public or os.getenv('GET_GITHASH') else "GET_GITHASH"
+    locals_dict = locals()
+    locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
+    if verbose:
+        print(f"Generating model with params:\n{locals_print}", flush=True)
+        print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), git_hash), flush=True)
+
+    if langchain_mode != "Disabled":
+        # SECOND PLACE where LangChain referenced, but all imports are kept local so not required
+        from gpt_langchain import prep_langchain, get_some_dbs_from_hf, get_persist_directory
+        if is_hf:
+            get_some_dbs_from_hf()
+        dbs = {}
+        for langchain_mode1 in langchain_modes:
+            langchain_type = langchain_mode_types.get(langchain_mode1, LangChainTypes.EITHER.value)
+            if langchain_type == LangChainTypes.PERSONAL.value:
+                # shouldn't prepare per-user databases here
+                continue
+            persist_directory1, langchain_type = get_persist_directory(langchain_mode1, langchain_type=langchain_type)
+            langchain_mode_types[langchain_mode1] = langchain_type
+            if langchain_type == LangChainTypes.PERSONAL.value:
+                # shouldn't prepare per-user databases here
+                continue
+            try:
+                db = prep_langchain(persist_directory1,
+                                    load_db_if_exists,
+                                    db_type, use_openai_embedding,
+                                    langchain_mode1, langchain_mode_paths, langchain_mode_types,
+                                    hf_embedding_model,
+                                    migrate_embedding_model,
+                                    auto_migrate_db,
+                                    kwargs_make_db=locals(),
+                                    verbose=verbose)
+            finally:
+                # in case updated embeddings or created new embeddings
+                clear_torch_cache()
+            dbs[langchain_mode1] = db
+        # remove None dbs so one can rely on `k in dbs` to check whether a db exists
+        dbs = {k: v for k, v in dbs.items() if v is not None}
+    else:
+        dbs = {}
+        # import control
+        if os.environ.get("TEST_LANGCHAIN_IMPORT"):
+            assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
+            assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
+
+    other_model_state_defaults = dict(load_8bit=load_8bit, load_4bit=load_4bit, low_bit_mode=low_bit_mode,
+                                      load_half=load_half,
+                                      load_gptq=load_gptq, load_exllama=load_exllama, use_safetensors=use_safetensors,
+                                      revision=revision, use_gpu_id=use_gpu_id, gpu_id=gpu_id,
+                                      compile_model=compile_model,
+                                      use_cache=use_cache,
+                                      llamacpp_dict=llamacpp_dict, model_path_llama=model_path_llama,
+                                      model_name_gptj=model_name_gptj,
+                                      model_name_gpt4all_llama=model_name_gpt4all_llama,
+                                      model_name_exllama_if_no_config=model_name_exllama_if_no_config,
+                                      )
+    model_state_none = dict(model=None, tokenizer=None, device=None,
+                            base_model=None, tokenizer_base_model=None, lora_weights=None,
+                            inference_server=None, prompt_type=None, prompt_dict=None,
+                            visible_models=None, h2ogpt_key=None,
+                            )
+    model_state_none.update(other_model_state_defaults)
+    my_db_state0 = {LangChainMode.MY_DATA.value: [None, None, None]}
+    selection_docs_state0 = dict(langchain_modes=langchain_modes,
+                                 langchain_mode_paths=langchain_mode_paths,
+                                 langchain_mode_types=langchain_mode_types)
+    selection_docs_state = copy.deepcopy(selection_docs_state0)
+
+    if cli or not gradio:
+        # initial state for query prompt
+        model_name = base_model
+        pre_prompt_query, prompt_query, pre_prompt_summary, prompt_summary = \
+            get_langchain_prompts(pre_prompt_query, prompt_query,
+                                  pre_prompt_summary, prompt_summary,
+                                  model_name, inference_server,
+                                  model_path_llama)
+
+    if cli:
+        from cli import run_cli
+        return run_cli(**get_kwargs(run_cli, exclude_names=['model_state0'], **locals()))
+    elif not gradio:
+        from eval import run_eval
+        return run_eval(**get_kwargs(run_eval, exclude_names=['model_state0'], **locals()))
+    elif gradio or prepare_offline_level > 0:
+        # imported here so don't require gradio to run generate
+        from gradio_runner import go_gradio
+
+        # get default model
+        model_states = []
+        model_list = [dict(base_model=base_model, tokenizer_base_model=tokenizer_base_model, lora_weights=lora_weights,
+                           inference_server=inference_server, prompt_type=prompt_type, prompt_dict=prompt_dict,
+                           visible_models=None, h2ogpt_key=None)]
+        model_list[0].update(other_model_state_defaults)
+        # FIXME: hyper per model, not about model loading
+        # for k in gen_hyper:
+        #     model_list[k] = locals()[k]
+
+        model_list0 = copy.deepcopy(model_list)  # just strings, safe to deepcopy
+        model_state0 = model_state_none.copy()
+        assert len(model_state_none) == len(model_state0)
+        if model_lock:
+            model_list = model_lock
+        # iterate in reverse so the first entry ends up as the default base_model etc., which simplifies logic in go_gradio()
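+        # e.g. (illustrative) model_list=[A, B] iterates B then A; each insert at index 0 below
+        # leaves model_states == [A, B], with A as the default model_state0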
+        for model_dict in reversed(model_list):
+            # handle defaults user didn't have to pass
+            # special defaults, ignore defaults for these if not specifically set, replace with ''
+            model_dict['base_model'] = model_dict.get('base_model', '')
+            model_dict['tokenizer_base_model'] = model_dict.get('tokenizer_base_model', '')
+            model_dict['lora_weights'] = model_dict.get('lora_weights', '')
+            model_dict['inference_server'] = model_dict.get('inference_server', '')
+            if prepare_offline_level >= 2:
+                if 'openai' not in model_dict['inference_server'] and 'replicate' not in model_dict['inference_server']:
+                    # assume model is wanted locally, but OpenAI and Replicate are never local for the model part
+                    model_dict['inference_server'] = ''
+            prompt_type_infer = not model_dict.get('prompt_type')
+            model_dict['prompt_type'] = model_dict.get('prompt_type',
+                                                       model_list0[0]['prompt_type'])  # don't use mutated value
+            # rest of generic defaults
+            for k in model_list0[0]:
+                if k not in model_dict:
+                    model_dict[k] = model_list0[0][k]
+
+            # begin prompt adjustments
+            # get query prompt for (say) last base model if using model lock
+            pre_prompt_query1, prompt_query1, pre_prompt_summary1, prompt_summary1 = (
+                get_langchain_prompts(pre_prompt_query, prompt_query,
+                                      pre_prompt_summary, prompt_summary,
+                                      model_dict['base_model'],
+                                      model_dict['inference_server'],
+                                      model_dict['model_path_llama']))
+            # if mixed setup, choose non-empty prompts so the best model's prompts take precedence
+            # FIXME: Make per model dict passed through to evaluate
+            pre_prompt_query = pre_prompt_query or pre_prompt_query1
+            prompt_query = prompt_query or prompt_query1
+            pre_prompt_summary = pre_prompt_summary or pre_prompt_summary1
+            prompt_summary = prompt_summary or prompt_summary1
+
+            # try to infer, ignore empty initial state leading to get_generate_params -> 'plain'
+            if prompt_type_infer:
+                model_lower1 = model_dict['base_model'].lower()
+                if model_lower1 in inv_prompt_type_to_model_lower:
+                    model_dict['prompt_type'] = inv_prompt_type_to_model_lower[model_lower1]
+                    model_dict['prompt_dict'], error0 = get_prompt(model_dict['prompt_type'], '',
+                                                                   chat=False, context='', reduced=False,
+                                                                   making_context=False,
+                                                                   return_dict=True,
+                                                                   system_prompt=system_prompt)
+                else:
+                    model_dict['prompt_dict'] = prompt_dict
+            else:
+                model_dict['prompt_dict'] = prompt_dict
+            # end prompt adjustments
+            all_kwargs = locals().copy()
+            all_kwargs.update(model_dict)
+            if model_dict['base_model'] and not login_mode_if_model0:
+                model0, tokenizer0, device = get_model(reward_type=False,
+                                                       **get_kwargs(get_model, exclude_names=['reward_type'],
+                                                                    **all_kwargs))
+            else:
+                # if empty model, then don't load anything, just get gradio up
+                model0, tokenizer0, device = None, None, None
+            if model0 is None:
+                if fail_if_cannot_connect:
+                    raise RuntimeError("Could not connect, see logs")
+                # skip
+                if isinstance(model_lock, list):
+                    model_lock.remove(model_dict)
+                continue
+            model_state_trial = dict(model=model0, tokenizer=tokenizer0, device=device)
+            model_state_trial.update(model_dict)
+            diff_keys = set(model_state_none.keys()).symmetric_difference(model_state_trial.keys())
+            assert len(model_state_none) == len(model_state_trial), diff_keys
+            print("Model %s" % model_dict, flush=True)
+            if model_lock:
+                # last in iteration will be first
+                model_states.insert(0, model_state_trial)
+                # fill model_state0 so go_gradio() is easier; model_states are managed separately
+                model_state0 = model_state_trial.copy()
+            else:
+                model_state0 = model_state_trial.copy()
+            assert len(model_state_none) == len(model_state0)
+
+        visible_models = str_to_list(visible_models, allow_none=True)  # None means first model
+        all_models = [x.get('base_model', xi) for xi, x in enumerate(model_states)]
+        visible_models_state0 = [x.get('base_model', xi) for xi, x in enumerate(model_states) if
+                                 visible_models is None or
+                                 x.get('base_model', xi) in visible_models or
+                                 xi in visible_models]
+
+        # update to be consistent with what is passed from CLI and the model chosen
+        # do after going over all models if multi-model, so earlier models don't contaminate the value
+        # this is just so the UI shows a reasonably correct value, not the 2048 dummy value
+        if len(model_states) >= 1:
+            max_seq_len = model_states[0]['tokenizer'].model_max_length
+
+        # get score model
+        all_kwargs = locals().copy()
+        smodel, stokenizer, sdevice = get_score_model(reward_type=True,
+                                                      **get_kwargs(get_score_model, exclude_names=['reward_type'],
+                                                                   **all_kwargs))
+        score_model_state0 = dict(model=smodel, tokenizer=stokenizer, device=sdevice,
+                                  base_model=score_model, tokenizer_base_model='', lora_weights='',
+                                  inference_server='', prompt_type='', prompt_dict='')
+
+        if enable_captions:
+            if pre_load_caption_model:
+                from image_captions import H2OImageCaptionLoader
+                caption_loader = H2OImageCaptionLoader(caption_gpu=caption_gpu).load_model()
+            else:
+                caption_loader = 'gpu' if n_gpus > 0 and caption_gpu else 'cpu'
+        else:
+            caption_loader = False
+
+        if pre_load_embedding_model and langchain_mode != 'Disabled' and not use_openai_embedding:
+            from src.gpt_langchain import get_embedding
+            hf_embedding_model = dict(name=hf_embedding_model,
+                                      model=get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model,
+                                                          preload=True))
+        if enable_doctr or enable_pdf_ocr in [True, 'auto', 'on']:
+            doctr_loader = 'gpu' if n_gpus > 0 and doctr_gpu else 'cpu'
+        else:
+            doctr_loader = False
+
+        # assume gradio needs everything
+        go_gradio(**locals())
+
+
+def get_config(base_model,
+               use_auth_token=False,
+               trust_remote_code=True,
+               offload_folder=None,
+               revision=None,
+               rope_scaling=None,
+               triton_attn=False,
+               long_sequence=True,
+               return_model=False,
+               raise_exception=False,
+               max_seq_len=None,
+               verbose=False,
+               ):
+    from accelerate import init_empty_weights
+    with init_empty_weights():
+        from transformers import AutoConfig
+        try:
+            config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
+                                                trust_remote_code=trust_remote_code,
+                                                offload_folder=offload_folder,
+                                                revision=revision,
+                                                rope_scaling=rope_scaling if rope_scaling else None)
+        except OSError as e:
+            if raise_exception:
+                raise
+            if 'not a local folder and is not a valid model identifier listed on' in str(e) or \
+                    '404 Client Error' in str(e) or "couldn't connect" in str(e):
+                # e.g. llama, gptj, etc.
+                # e.g. HF TGI but not model on HF or private etc.
+                if max_seq_len is None and base_model.lower() in non_hf_types:
+                    print("Could not determine --max_seq_len, setting to 2048.  Pass if not correct", flush=True)
+                    max_seq_len = 2048
+                # HF TGI server only should really require prompt_type, not HF model state
+                return None, None, max_seq_len
+            else:
+                raise
+        if triton_attn and 'mpt-' in base_model.lower():
+            config.attn_config['attn_impl'] = 'triton'
+        if long_sequence:
+            if 'mpt-7b-storywriter' in base_model.lower():
+                config.update({"max_seq_len": 83968})
+            if 'mosaicml/mpt-7b-chat' in base_model.lower():
+                config.update({"max_seq_len": 4096})
+            if 'mpt-30b' in base_model.lower():
+                config.update({"max_seq_len": 2 * 8192})
+        if return_model and \
+                issubclass(config.__class__, tuple(AutoModel._model_mapping.keys())):
+            model = AutoModel.from_config(
+                config,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            # can't infer
+            model = None
+    if 'falcon' in base_model.lower():
+        config.use_cache = False
+
+    # allow override
+    if max_seq_len is not None:
+        print("Overriding max_seq_len -> %d" % max_seq_len, flush=True)
+    else:
+        if hasattr(config, 'max_seq_len'):
+            max_seq_len = int(config.max_seq_len)
+        elif hasattr(config, 'max_position_embeddings') and isinstance(config.max_position_embeddings, int):
+            # help automatically limit inputs to generate
+            max_seq_len = config.max_position_embeddings
+            if verbose:
+                print("Used max_position_embeddings=%s as base model (pre-rope) max_seq_len."
+                      "  If not desired, pass --max_seq_len and set to some integer value." % config.max_position_embeddings,
+                      flush=True)
+        elif hasattr(config, 'n_ctx'):
+            # e.g. gpt2
+            max_seq_len = int(config.n_ctx)
+        else:
+            print("Could not determine --max_seq_len, setting to 2048.  Pass if not correct", flush=True)
+            max_seq_len = 2048
+            # FIXME:
+            # raise RuntimeError("Could not determine max_seq_len,"
+            #                   " please pass --max_seq_len and set to some value, e.g. 2048.")
+
+        if rope_scaling:
+            if rope_scaling.get('factor'):
+                # HF transformers
+                max_seq_len = int(max_seq_len * rope_scaling.get('factor'))  # factor may be float; keep int
+            elif rope_scaling.get('alpha_value'):
+                # exllama
+                # Note: exllama's own tokenizer has this set correctly in loaders.py, this config will be unused
+                max_seq_len = int(max_seq_len * rope_scaling.get('alpha_value'))  # alpha_value may be float; keep int
+            print("Automatically setting max_seq_len=%d for RoPE scaling" % max_seq_len, flush=True)
+
+    return config, model, max_seq_len
+
+
+def get_non_lora_model(base_model, model_loader, load_half,
+                       load_gptq,
+                       load_exllama,
+                       use_safetensors,
+                       revision,
+                       model_kwargs, reward_type,
+                       config, model,
+                       gpu_id=0,
+                       ):
+    """
+    Ensure model gets on correct device
+    """
+
+    if model is not None:
+        # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
+        # NOTE: Some models require avoiding sharding some layers,
+        # then would pass no_split_module_classes and give list of those layers.
+        from accelerate import infer_auto_device_map
+        device_map = infer_auto_device_map(
+            model,
+            dtype=torch.float16 if load_half else torch.float32,
+        )
+        if hasattr(model, 'model'):
+            device_map_model = infer_auto_device_map(
+                model.model,
+                dtype=torch.float16 if load_half else torch.float32,
+            )
+            device_map.update(device_map_model)
+    else:
+        device_map = "auto"
+
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    n_gpus, gpu_ids = cuda_vis_check(n_gpus)
+
+    if n_gpus > 0:
+        if gpu_id >= 0:
+            # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
+            # So avoid for now, just put on first GPU, unless score_model, put on last
+            if reward_type:
+                device_map = {'': n_gpus - 1}
+            else:
+                device_map = {'': min(n_gpus - 1, gpu_id)}
+        if gpu_id == -1:
+            device_map = {'': 'cuda'}
+    else:
+        device_map = {'': 'cpu'}
+        model_kwargs['load_in_8bit'] = False
+        model_kwargs['load_in_4bit'] = False
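+    # e.g. (illustrative) with 4 GPUs: gpu_id=1 -> {'': 1}; reward_type -> {'': 3};
+    # gpu_id=-1 -> {'': 'cuda'}; no usable GPUs -> {'': 'cpu'}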
+    print('device_map: %s' % device_map, flush=True)
+
+    load_in_8bit = model_kwargs.get('load_in_8bit', False)
+    load_in_4bit = model_kwargs.get('load_in_4bit', False)
+    model_kwargs['device_map'] = device_map
+    model_kwargs['use_safetensors'] = use_safetensors
+    model_kwargs['revision'] = revision
+    pop_unused_model_kwargs(model_kwargs)
+
+    if load_exllama:
+        model = model_loader
+    elif load_gptq:
+        if 'Llama-2-70B-chat-GPTQ' in base_model:
+            model_kwargs.update(dict(inject_fused_attention=False))
+        model_kwargs.pop('torch_dtype', None)
+        model_kwargs.pop('device_map')
+        model = model_loader(
+            model_name_or_path=base_model,
+            model_basename=load_gptq,
+            **model_kwargs,
+        )
+    elif load_in_8bit or load_in_4bit or not load_half:
+        model = model_loader(
+            base_model,
+            config=config,
+            **model_kwargs,
+        )
+    else:
+        model = model_loader(
+            base_model,
+            config=config,
+            **model_kwargs,
+        )
+        if not getattr(model, "is_quantized", False):
+            model = model.half()
+    return model
+
+
+def get_client_from_inference_server(inference_server, base_model=None, raise_connection_exception=False):
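+    """Probe an inference_server URL, preferring a Gradio client with fallback to an HF TGI client.
+
+    Returns (inference_server, gr_client, hf_client), where at most one client is non-None
+    (gr_client preferred); both are None if neither connects and raise_connection_exception is False.
+    """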
+    inference_server, headers = get_hf_server(inference_server)
+    # preload client since slow for gradio case especially
+    from gradio_utils.grclient import GradioClient
+    gr_client = None
+    hf_client = None
+    if headers is None:
+        try:
+            print("GR Client Begin: %s %s" % (inference_server, base_model), flush=True)
+            # first do sanity check if alive, else gradio client takes too long by default
+            requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT', '30')))
+            gr_client = GradioClient(inference_server)
+            print("GR Client End: %s" % inference_server, flush=True)
+        except (OSError, ValueError) as e:
+            # Occurs when wrong endpoint and should have been HF client, so don't hard raise, just move to HF
+            gr_client = None
+            print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(e)), flush=True)
+        except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2,
+                JSONDecodeError, ReadTimeout2, KeyError) as e:
+            t, v, tb = sys.exc_info()
+            ex = ''.join(traceback.format_exception(t, v, tb))
+            print("GR Client Failed %s %s: %s" % (inference_server, base_model, str(ex)), flush=True)
+            if raise_connection_exception:
+                raise
+
+    if gr_client is None:
+        res = None
+        from text_generation import Client as HFClient
+        print("HF Client Begin: %s %s" % (inference_server, base_model))
+        try:
+            hf_client = HFClient(inference_server, headers=headers, timeout=int(os.getenv('REQUEST_TIMEOUT', '30')))
+            # quick check valid TGI endpoint
+            res = hf_client.generate('What?', max_new_tokens=1)
+            hf_client = HFClient(inference_server, headers=headers, timeout=300)
+        except (ConnectTimeoutError, ConnectTimeout, MaxRetryError, ConnectionError, ConnectionError2,
+                JSONDecodeError, ReadTimeout2, KeyError) as e:
+            hf_client = None
+            t, v, tb = sys.exc_info()
+            ex = ''.join(traceback.format_exception(t, v, tb))
+            print("HF Client Failed %s %s: %s" % (inference_server, base_model, str(ex)))
+            if raise_connection_exception:
+                raise
+        print("HF Client End: %s %s : %s" % (inference_server, base_model, res))
+    return inference_server, gr_client, hf_client
+
+
+def get_model(
+        load_8bit: bool = False,
+        load_4bit: bool = False,
+        low_bit_mode: int = 1,
+        load_half: bool = True,
+        load_gptq: str = '',
+        load_exllama: bool = False,
+        use_safetensors: bool = False,
+        revision: str = None,
+        use_gpu_id: bool = True,
+        base_model: str = '',
+        inference_server: str = "",
+        tokenizer_base_model: str = '',
+        lora_weights: str = "",
+        gpu_id: int = 0,
+        n_jobs=None,
+
+        reward_type: bool = None,
+        local_files_only: bool = False,
+        resume_download: bool = True,
+        use_auth_token: Union[str, bool] = False,
+        trust_remote_code: bool = True,
+        offload_folder: str = None,
+        rope_scaling: dict = None,
+        max_seq_len: int = None,
+        compile_model: bool = True,
+        llamacpp_dict=None,
+
+        verbose: bool = False,
+):
+    """
+
+    :param load_8bit: load model in 8-bit, not supported by all models
+    :param load_4bit: load model in 4-bit, not supported by all models
+    :param low_bit_mode: See gen.py
+    :param load_half: load model in 16-bit
+    :param load_gptq: GPTQ model_basename
+    :param load_exllama: whether to use exllama
+    :param use_safetensors: use safetensors file
+    :param revision: model revision (branch, tag, or commit) to load from the HF hub
+    :param use_gpu_id: Use torch infer of optimal placement of layers on devices (for non-lora case)
+           For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
+           So it is not the default
+    :param base_model: name/path of base model
+    :param inference_server: whether base_model is hosted locally ('') or via http (url)
+    :param tokenizer_base_model: name/path of tokenizer
+    :param lora_weights: name/path
+    :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
+    :param n_jobs: number of cores to use (e.g. for llama CPU model)
+    :param reward_type: reward type model for sequence classification
+    :param local_files_only: use local files instead of from HF
+    :param resume_download: resume downloads from HF
+    :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
+    :param trust_remote_code: trust code needed by model
+    :param offload_folder: offload folder
+    :param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}"
+    :param max_seq_len: if set, override the model's maximum sequence length
+    :param compile_model: whether to compile torch model
+    :param llamacpp_dict: dict of llama.cpp and GPT4All model options
+    :param verbose: whether to print verbose model-loading info
+    :return:
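+
+    Example (illustrative sketch for a local HF model)::
+
+        model, tokenizer, device = get_model(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b',
+                                             load_half=True, gpu_id=0)
+        # an http inference server instead returns (client, tokenizer, 'http'):
+        # client, tokenizer, _ = get_model(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b',
+        #                                  inference_server='http://host:port')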
+    """
+    print("Starting get_model: %s %s" % (base_model, inference_server), flush=True)
+
+    triton_attn = False
+    long_sequence = True
+    config_kwargs = dict(use_auth_token=use_auth_token,
+                         trust_remote_code=trust_remote_code,
+                         offload_folder=offload_folder,
+                         rope_scaling=rope_scaling,
+                         triton_attn=triton_attn,
+                         long_sequence=long_sequence,
+                         revision=revision,
+                         max_seq_len=max_seq_len,
+                         verbose=verbose)
+    config, _, max_seq_len = get_config(base_model, **config_kwargs, raise_exception=False)
+
+    if base_model in non_hf_types:
+        assert config is None, "Expected config None for %s" % base_model
+
+    llama_type_from_config = 'llama' in str(config).lower()
+    llama_type_from_name = "llama" in base_model.lower()
+    llama_type = llama_type_from_config or llama_type_from_name
+    if "xgen" in base_model.lower() or 'llama2' in base_model.lower() or 'llama-2' in base_model.lower():
+        llama_type = False
+    if llama_type:
+        if verbose:
+            print("Detected as llama type from"
+                  " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
+
+    model_name_exllama_if_no_config = '' if not llamacpp_dict else llamacpp_dict.get('model_name_exllama_if_no_config',
+                                                                                     '')
+    model_loader, tokenizer_loader, conditional_type = (
+        get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type,
+                    load_gptq=load_gptq, load_exllama=load_exllama, config=config,
+                    rope_scaling=rope_scaling, max_seq_len=max_seq_len,
+                    model_name_exllama_if_no_config=model_name_exllama_if_no_config))
+
+    tokenizer_kwargs = dict(local_files_only=local_files_only,
+                            resume_download=resume_download,
+                            use_auth_token=use_auth_token,
+                            trust_remote_code=trust_remote_code,
+                            offload_folder=offload_folder,
+                            revision=revision,
+                            padding_side='left',
+                            config=config,
+                            )
+    if not tokenizer_base_model:
+        tokenizer_base_model = base_model
+
+    if load_exllama:
+        tokenizer = tokenizer_loader
+    elif config is not None and tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
+        # load_exllama was already handled above, so this branch always loads from_pretrained
+        tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, **tokenizer_kwargs)
+        # sets raw (no cushion) limit
+        # If using RoPE with scaling, then for non-exllama models (e.g. HF models),
+        #  then config -> tokenizer will set model_max_length correctly
+        set_model_max_len(max_seq_len, tokenizer, verbose=False)
+        # if using fake tokenizer, not really accurate when lots of numbers, give a bit of buffer, else get:
+        # Generation Failed: Input validation error: `inputs` must have less than 2048 tokens. Given: 2233
+        tokenizer.model_max_length = tokenizer.model_max_length - 50
+    else:
+        tokenizer = None
+
+    if isinstance(inference_server, str) and inference_server.startswith("http"):
+        inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server,
+                                                                                  base_model=base_model)
+        client = gr_client or hf_client
+        # Don't return None, None for model, tokenizer so triggers
+        if tokenizer is None:
+            # FIXME: Could use only tokenizer from llama etc. but hard to detach from model, just use fake for now
+            if os.getenv("HARD_ASSERTS") and base_model not in non_hf_types:
+                raise RuntimeError("Unexpected tokenizer=None")
+            tokenizer = FakeTokenizer()
+        return client, tokenizer, 'http'
+    if isinstance(inference_server, str) and (
+            inference_server.startswith('openai') or
+            inference_server.startswith('vllm') or
+            inference_server.startswith('replicate') or
+            inference_server.startswith('sagemaker')
+    ):
+        if inference_server.startswith('openai'):
+            assert os.getenv('OPENAI_API_KEY'), "Set environment for OPENAI_API_KEY"
+            # Don't return None, None for model, tokenizer so triggers
+            # include small token cushion
+            max_seq_len = model_token_mapping[base_model]
+        if inference_server.startswith('replicate'):
+            assert len(inference_server.split(':')) >= 3, "Expected replicate:model string, got %s" % inference_server
+            assert os.getenv('REPLICATE_API_TOKEN'), "Set environment for REPLICATE_API_TOKEN"
+            assert max_seq_len is not None, "Please pass --max_seq_len=<max_seq_len> for replicate models."
+            try:
+                import replicate as replicate_python
+            except ImportError:
+                raise ImportError(
+                    "Could not import replicate python package. "
+                    "Please install it with `pip install replicate`."
+                )
+        if inference_server.startswith('sagemaker'):
+            assert len(inference_server.split(':')) >= 3, \
+                "Expected sagemaker_chat:<endpoint name>:<region>, got %s" % inference_server
+            assert os.getenv('AWS_ACCESS_KEY_ID'), "Set environment for AWS_ACCESS_KEY_ID"
+            assert os.getenv('AWS_SECRET_ACCESS_KEY'), "Set environment for AWS_SECRET_ACCESS_KEY"
+        # Don't return None, None for model, tokenizer so triggers
+        # include small token cushion
+        if inference_server.startswith('openai') or tokenizer is None:
+            # don't use fake (tiktoken) tokenizer for vLLM/Replicate if the actual model and tokenizer are known
+            tokenizer = FakeTokenizer(model_max_length=max_seq_len - 50)
+        return inference_server, tokenizer, inference_server
+    assert not inference_server, "Malformed inference_server=%s" % inference_server
+    if base_model in non_hf_types:
+        from gpt4all_llm import get_model_tokenizer_gpt4all
+        model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs,
+                                                               max_seq_len=max_seq_len,
+                                                               llamacpp_dict=llamacpp_dict)
+        return model, tokenizer, device
+    if load_exllama:
+        return model_loader, tokenizer, 'cuda'
+
+    # get local torch-HF model
+    return get_hf_model(load_8bit=load_8bit,
+                        load_4bit=load_4bit,
+                        low_bit_mode=low_bit_mode,
+                        load_half=load_half,
+                        load_gptq=load_gptq,
+                        use_safetensors=use_safetensors,
+                        revision=revision,
+                        use_gpu_id=use_gpu_id,
+                        base_model=base_model,
+                        tokenizer_base_model=tokenizer_base_model,
+                        lora_weights=lora_weights,
+                        gpu_id=gpu_id,
+
+                        reward_type=reward_type,
+                        local_files_only=local_files_only,
+                        resume_download=resume_download,
+                        use_auth_token=use_auth_token,
+                        trust_remote_code=trust_remote_code,
+                        offload_folder=offload_folder,
+                        rope_scaling=rope_scaling,
+                        compile_model=compile_model,
+
+                        llama_type=llama_type,
+                        config_kwargs=config_kwargs,
+                        tokenizer_kwargs=tokenizer_kwargs,
+
+                        verbose=verbose)
+
+
+def get_hf_model(load_8bit: bool = False,
+                 load_4bit: bool = False,
+                 low_bit_mode: int = 1,
+                 load_half: bool = True,
+                 load_gptq: str = '',
+                 use_safetensors: bool = False,
+                 revision: str = None,
+                 use_gpu_id: bool = True,
+                 base_model: str = '',
+                 tokenizer_base_model: str = '',
+                 lora_weights: str = "",
+                 gpu_id: int = 0,
+
+                 reward_type: bool = None,
+                 local_files_only: bool = False,
+                 resume_download: bool = True,
+                 use_auth_token: Union[str, bool] = False,
+                 trust_remote_code: bool = True,
+                 offload_folder: str = None,
+                 rope_scaling: dict = None,
+                 compile_model: bool = True,
+
+                 llama_type: bool = False,
+                 config_kwargs=None,
+                 tokenizer_kwargs=None,
+
+                 verbose: bool = False,
+                 ):
+    assert config_kwargs is not None
+    assert tokenizer_kwargs is not None
+
+    load_exllama = False  # Never should be in HF code for exllama
+
+    if lora_weights is not None and lora_weights.strip():
+        if verbose:
+            print("Get %s lora weights" % lora_weights, flush=True)
+    device = get_device()
+
+    if 'gpt2' in base_model.lower():
+        # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
+        load_8bit = False
+        load_4bit = False
+
+    assert base_model.strip(), (
+        "Please choose a base model with --base_model (CLI) or load one from Models Tab (gradio)"
+    )
+
+    model_loader, tokenizer_loader, conditional_type = (
+        get_loaders(model_name=base_model, reward_type=reward_type, llama_type=llama_type,
+                    load_gptq=load_gptq, load_exllama=load_exllama))
+
+    config, _, max_seq_len = get_config(base_model, return_model=False, raise_exception=True, **config_kwargs)
+
+    if tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
+        if load_exllama:
+            tokenizer = tokenizer_loader
+        else:
+            tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
+                                                         **tokenizer_kwargs)
+    else:
+        tokenizer = tokenizer_loader
+
+    if isinstance(tokenizer, str):
+        # already a pipeline, tokenizer_loader is string for task
+        model = model_loader(tokenizer,
+                             model=base_model,
+                             device=0 if device == "cuda" else -1,
+                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32)
+    else:
+        assert device in ["cuda", "cpu", "mps"], "Unsupported device %s" % device
+        model_kwargs = dict(local_files_only=local_files_only,
+                            torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
+                            resume_download=resume_download,
+                            use_auth_token=use_auth_token,
+                            trust_remote_code=trust_remote_code,
+                            offload_folder=offload_folder,
+                            revision=revision,
+                            # rope_scaling=rope_scaling,  # only put into config
+                            )
+        if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
+            if use_gpu_id and gpu_id is not None and gpu_id >= 0 and device == 'cuda':
+                device_map = {"": gpu_id}
+            else:
+                device_map = "auto"
+            model_kwargs.update(dict(load_in_8bit=load_8bit,
+                                     load_in_4bit=load_4bit,
+                                     device_map=device_map,
+                                     ))
+        if 'mpt-' in base_model.lower() and gpu_id is not None and gpu_id >= 0:
+            # MPT doesn't support spreading over GPUs
+            model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu"))
+
+        if 'OpenAssistant/reward-model'.lower() in base_model.lower():
+            # FIXME: could put on other GPUs
+            model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
+            model_kwargs.pop('torch_dtype', None)
+        pop_unused_model_kwargs(model_kwargs)
+
+        n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+        n_gpus, gpu_ids = cuda_vis_check(n_gpus)
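+        # low_bit_mode selects the BitsAndBytes recipe in the branches below:
+        #   1: bfloat16 compute dtype, 2: nf4 quant type,
+        #   3: double quantization, 4: double quantization + nf4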
+        if low_bit_mode == 1 and n_gpus != 0:
+            from transformers import BitsAndBytesConfig
+            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16,
+                                                                     load_in_4bit=load_4bit,
+                                                                     load_in_8bit=load_8bit,
+                                                                     )
+        elif low_bit_mode == 2 and n_gpus != 0:
+            from transformers import BitsAndBytesConfig
+            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_quant_type="nf4",
+                                                                     load_in_4bit=load_4bit,
+                                                                     load_in_8bit=load_8bit,
+                                                                     )
+        elif low_bit_mode == 3 and n_gpus != 0:
+            from transformers import BitsAndBytesConfig
+            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True,
+                                                                     load_in_4bit=load_4bit,
+                                                                     load_in_8bit=load_8bit,
+                                                                     )
+        elif low_bit_mode == 4 and n_gpus != 0:
+            from transformers import BitsAndBytesConfig
+            model_kwargs['quantization_config'] = BitsAndBytesConfig(bnb_4bit_use_double_quant=True,
+                                                                     bnb_4bit_quant_type="nf4",
+                                                                     load_in_4bit=load_4bit,
+                                                                     load_in_8bit=load_8bit,
+                                                                     )
+
+        if not lora_weights:
+            # torch.device context uses twice memory for AutoGPTQ
+            context = NullContext if load_gptq else torch.device
+            with context(device):
+
+                if use_gpu_id:
+                    config, model, max_seq_len = get_config(base_model,
+                                                            return_model=True, raise_exception=True, **config_kwargs)
+                    model = get_non_lora_model(base_model, model_loader, load_half, load_gptq,
+                                               load_exllama,
+                                               use_safetensors,
+                                               revision,
+                                               model_kwargs, reward_type,
+                                               config, model,
+                                               gpu_id=gpu_id,
+                                               )
+                else:
+                    config, _, max_seq_len = get_config(base_model, **config_kwargs)
+                    if load_half and not (load_8bit or load_4bit or load_gptq):
+                        model = model_loader(
+                            base_model,
+                            config=config,
+                            **model_kwargs)
+                        if not getattr(model, "is_quantized", False):
+                            model = model.half()
+                    else:
+                        model = model_loader(
+                            base_model,
+                            config=config,
+                            **model_kwargs)
+        elif load_8bit or load_4bit:
+            config, _, max_seq_len = get_config(base_model, **config_kwargs)
+            model = model_loader(
+                base_model,
+                config=config,
+                **model_kwargs
+            )
+            from peft import PeftModel  # loads cuda, so avoid in global scope
+            model = PeftModel.from_pretrained(
+                model,
+                lora_weights,
+                torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
+                local_files_only=local_files_only,
+                resume_download=resume_download,
+                use_auth_token=use_auth_token,
+                trust_remote_code=trust_remote_code,
+                offload_folder=offload_folder,
+                rope_scaling=rope_scaling,
+                revision=revision,
+                device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
+            )
+        else:
+            with torch.device(device):
+                config, _, max_seq_len = get_config(base_model, raise_exception=True, **config_kwargs)
+                model = model_loader(
+                    base_model,
+                    config=config,
+                    **model_kwargs
+                )
+                from peft import PeftModel  # loads cuda, so avoid in global scope
+                model = PeftModel.from_pretrained(
+                    model,
+                    lora_weights,
+                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
+                    local_files_only=local_files_only,
+                    resume_download=resume_download,
+                    use_auth_token=use_auth_token,
+                    trust_remote_code=trust_remote_code,
+                    offload_folder=offload_folder,
+                    rope_scaling=rope_scaling,
+                    device_map="auto",
+                )
+                if load_half and not load_gptq:
+                    if not getattr(model, "is_quantized", False):
+                        model = model.half()
+
+    # unwind broken decapoda-research config
+    if llama_type:
+        model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
+        model.config.bos_token_id = 1
+        model.config.eos_token_id = 2
+    if 'gpt2' in base_model.lower():
+        # add special tokens that otherwise all share the same id
+        tokenizer.add_special_tokens({'bos_token': '<bos>',
+                                      'eos_token': '<eos>',
+                                      'pad_token': '<pad>'})
+
+    if not isinstance(tokenizer, str):
+        model.eval()
+        if torch.__version__ >= "2" and sys.platform != "win32" and compile_model:
+            model = torch.compile(model)
+
+    set_model_max_len(max_seq_len, tokenizer, verbose=False, reward_type=reward_type)
+
+    # tell if conditional type
+    model.conditional_type = conditional_type
+    tokenizer.conditional_type = conditional_type
+
+    return model, tokenizer, device
+
+
+def set_model_max_len(max_seq_len, tokenizer, verbose=False, reward_type=False):
+    if reward_type:
+        # limit deberta, else it uses too much memory and is not worth it for a response score
+        tokenizer.model_max_length = 512
+        return
+
+    tokenizer.model_max_length = int(max_seq_len)
+    if verbose:
+        print("model_max_length=%s" % tokenizer.model_max_length, flush=True)
+    # HF transformers reports a huge sentinel value when the tokenizer has no real limit; fall back to 2048
+    if tokenizer.model_max_length > 100000000:
+        tokenizer.model_max_length = 2048
+
+
+def pop_unused_model_kwargs(model_kwargs):
+    """
+    in-place pop unused kwargs that are not dependency-upgrade friendly
+    no point passing in False, is default, and helps avoid needing to update requirements for new deps
+    :param model_kwargs:
+    :return:
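+
+    Example::
+
+        kwargs = dict(load_in_8bit=False, load_in_4bit=True)
+        pop_unused_model_kwargs(kwargs)
+        # kwargs is now {'load_in_4bit': True}; the default False entry was dropped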
+    """
+    check_list = ['load_in_8bit', 'load_in_4bit']
+    for k in check_list:
+        if k in model_kwargs and not model_kwargs[k]:
+            model_kwargs.pop(k)
+
+
+def get_score_model(score_model: str = None,
+                    load_8bit: bool = False,
+                    load_4bit: bool = False,
+                    low_bit_mode=1,
+                    load_half: bool = True,
+                    load_gptq: str = '',
+                    load_exllama: bool = False,
+                    use_gpu_id: bool = True,
+                    base_model: str = '',
+                    inference_server: str = '',
+                    tokenizer_base_model: str = '',
+                    lora_weights: str = "",
+                    gpu_id: int = 0,
+                    n_jobs=None,
+
+                    reward_type: bool = None,
+                    local_files_only: bool = False,
+                    resume_download: bool = True,
+                    use_auth_token: Union[str, bool] = False,
+                    trust_remote_code: bool = True,
+                    offload_folder: str = None,
+                    rope_scaling: dict = None,
+                    compile_model: bool = True,
+                    llamacpp_dict: typing.Dict = None,
+
+                    verbose: bool = False,
+                    ):
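+    """Load the optional response-scoring (reward) model with fixed, safe settings.
+
+    Quantization, LoRA, and inference-server options are overridden since the reward
+    model is loaded locally as a plain sequence-classification model (reward_type=True).
+    """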
+    if score_model is not None and score_model.strip():
+        load_8bit = False
+        load_4bit = False
+        low_bit_mode = 1
+        load_half = False
+        load_gptq = ''
+        load_exllama = False
+        use_safetensors = False
+        revision = None
+        base_model = score_model.strip()
+        tokenizer_base_model = ''
+        lora_weights = ''
+        inference_server = ''
+        llama_type = False
+        max_seq_len = None
+        compile_model = False
+        llamacpp_dict = {}
+        smodel, stokenizer, sdevice = get_model(reward_type=True,
+                                                **get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
+    else:
+        smodel, stokenizer, sdevice = None, None, None
+    return smodel, stokenizer, sdevice
+
+
+def evaluate_fake(*args, **kwargs):
+    yield dict(response=invalid_key_msg, sources='')
+    return
+
+
+def evaluate(
+        model_state,
+        my_db_state,
+        selection_docs_state,
+        requests_state,
+        # START NOTE: Examples must have same order of parameters
+        instruction,
+        iinput,
+        context,
+        stream_output,
+        prompt_type,
+        prompt_dict,
+        temperature,
+        top_p,
+        top_k,
+        num_beams,
+        max_new_tokens,
+        min_new_tokens,
+        early_stopping,
+        max_time,
+        repetition_penalty,
+        num_return_sequences,
+        do_sample,
+        chat,
+        instruction_nochat,
+        iinput_nochat,
+        langchain_mode,
+        add_chat_history_to_context,
+        langchain_action,
+        langchain_agents,
+        top_k_docs,
+        chunk,
+        chunk_size,
+        document_subset,
+        document_choice,
+        pre_prompt_query,
+        prompt_query,
+        pre_prompt_summary,
+        prompt_summary,
+        system_prompt,
+
+        image_loaders,
+        pdf_loaders,
+        url_loaders,
+        jq_schema,
+        visible_models,
+        h2ogpt_key,
+        add_search_to_context,
+        chat_conversation,
+        text_context_list,
+        docs_ordering_type,
+        min_max_new_tokens,
+
+        # END NOTE: Examples must have same order of parameters
+        captions_model=None,
+        caption_loader=None,
+        doctr_loader=None,
+        pix2struct_loader=None,
+        async_output=None,
+        num_async=None,
+        src_lang=None,
+        tgt_lang=None,
+        debug=False,
+        concurrency_count=None,
+        save_dir=None,
+        sanitize_bot_response=False,
+        model_state0=None,
+        memory_restriction_level=None,
+        max_max_new_tokens=None,
+        is_public=None,
+        max_max_time=None,
+        raise_generate_gpu_exceptions=None,
+        lora_weights=None,
+        use_llm_if_no_docs=True,
+        load_db_if_exists=True,
+        dbs=None,
+        detect_user_path_changes_every_query=None,
+        use_openai_embedding=None,
+        use_openai_model=None,
+        hf_embedding_model=None,
+        migrate_embedding_model=None,
+        auto_migrate_db=None,
+        cut_distance=None,
+        db_type=None,
+        n_jobs=None,
+        first_para=None,
+        text_limit=None,
+        show_accordions=None,
+        top_k_docs_max_show=None,
+        show_link_in_sources=None,
+        verbose=False,
+        cli=False,
+        use_cache=None,
+        auto_reduce_chunks=None,
+        max_chunks=None,
+        headsize=None,
+        model_lock=None,
+        force_langchain_evaluate=None,
+        model_state_none=None,
+        load_exllama=None,
+        answer_with_sources=None,
+        append_sources_to_answer=None,
+        image_loaders_options0=None,
+        pdf_loaders_options0=None,
+        url_loaders_options0=None,
+        jq_schema0=None,
+        keep_sources_in_context=None,
+):
+    # ensure passed these
+    assert concurrency_count is not None
+    assert memory_restriction_level is not None
+    assert raise_generate_gpu_exceptions is not None
+    assert use_openai_embedding is not None
+    assert use_openai_model is not None
+    assert hf_embedding_model is not None
+    assert migrate_embedding_model is not None
+    assert auto_migrate_db is not None
+    assert db_type is not None
+    assert top_k_docs is not None and isinstance(top_k_docs, int)
+    assert chunk is not None and isinstance(chunk, bool)
+    assert chunk_size is not None and isinstance(chunk_size, int)
+    assert n_jobs is not None
+    assert first_para is not None
+    assert isinstance(add_chat_history_to_context, bool)
+    assert isinstance(add_search_to_context, bool)
+    assert load_exllama is not None
+    # for lazy client (even chat client)
+    if image_loaders is None:
+        image_loaders = image_loaders_options0
+    if pdf_loaders is None:
+        pdf_loaders = pdf_loaders_options0
+    if url_loaders is None:
+        url_loaders = url_loaders_options0
+    if jq_schema is None:
+        jq_schema = jq_schema0
+    if isinstance(langchain_agents, str):
+        if langchain_agents.strip().startswith('['):
+            # already list, but as string
+            langchain_agents = str_to_list(langchain_agents)
+        else:
+            # just 1 item and make list
+            langchain_agents = [langchain_agents]
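+    # e.g. (illustrative) langchain_agents="['Search']" became ['Search'] via str_to_list above,
+    # while a bare "Search" was wrapped as ['Search']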
+    chat_conversation = str_to_list(chat_conversation)
+    text_context_list = str_to_list(text_context_list)
+
+    langchain_modes = selection_docs_state['langchain_modes']
+    langchain_mode_paths = selection_docs_state['langchain_mode_paths']
+    langchain_mode_types = selection_docs_state['langchain_mode_types']
+
+    if debug:
+        locals_dict = locals().copy()
+        locals_dict.pop('model_state', None)
+        locals_dict.pop('model_state0', None)
+        locals_dict.pop('model_states', None)
+        print(locals_dict)
+
+    no_model_msg = "Please choose a base model with --base_model (CLI) or load in Models Tab (gradio).\n" \
+                   "Then start New Conversation"
+
+    if model_state is None:
+        model_state = model_state_none.copy()
+    if model_state0 is None:
+        # e.g. for no gradio case, set dummy value, else should be set
+        model_state0 = model_state_none.copy()
+
+    # model_state['model'] is the string 'model' only if model_state0 should be used
+    # model could also be None
+    have_model_lock = model_lock is not None
+    have_fresh_model = model_state['model'] not in [None, 'model', no_model_str]
+    # for gradio UI control, expect model_state and model_state0 to match, so if have_model_lock=True, then should have_fresh_model=True
+    # but gradio API control will only use nochat api etc. and won't use fresh model, so can't assert in general
+    # if have_model_lock:
+    #    assert have_fresh_model, "Expected model_state and model_state0 to match if have_model_lock"
+    have_cli_model = model_state0['model'] not in [None, 'model', no_model_str]
+
+    if have_fresh_model:
+        # USE FRESH MODEL
+        if not have_model_lock:
+            # model_state0 is just one of model_state if model_lock, so don't nuke
+            # try to free-up original model (i.e. list was passed as reference)
+            if model_state0['model'] and hasattr(model_state0['model'], 'cpu'):
+                model_state0['model'].cpu()
+                model_state0['model'] = None
+            # try to free-up original tokenizer (i.e. list was passed as reference)
+            if model_state0['tokenizer']:
+                model_state0['tokenizer'] = None
+            clear_torch_cache()
+        chosen_model_state = model_state
+    elif have_cli_model:
+        # USE MODEL SETUP AT CLI
+        assert isinstance(model_state['model'], (type(None), str))  # expect no fresh model
+        chosen_model_state = model_state0
+    else:
+        raise AssertionError(no_model_msg)
+    # get variables
+    model = chosen_model_state['model']
+    tokenizer = chosen_model_state['tokenizer']
+    device = chosen_model_state['device']
+    base_model = chosen_model_state['base_model']
+    tokenizer_base_model = chosen_model_state['tokenizer_base_model']
+    lora_weights = chosen_model_state['lora_weights']
+    inference_server = chosen_model_state['inference_server']
+    visible_models = chosen_model_state['visible_models']
+    # use overall key if have, so key for this gradio and any inner gradio
+    if chosen_model_state['h2ogpt_key'] is not None:
+        h2ogpt_key = chosen_model_state['h2ogpt_key']
+    # prefer use input from API over model state
+    prompt_type = prompt_type or chosen_model_state['prompt_type']
+    prompt_dict = prompt_dict or chosen_model_state['prompt_dict']
+
+    if base_model is None:
+        raise AssertionError(no_model_msg)
+
+    assert base_model.strip(), no_model_msg
+    assert model, "Model is missing"
+    assert tokenizer, "Tokenizer is missing"
+
+    # choose chat or non-chat mode
+    if not chat:
+        instruction = instruction_nochat
+        iinput = iinput_nochat
+
+    # in some cases, like lean nochat API, don't want to force sending prompt_type, allow default choice
+    model_lower = base_model.lower()
+    if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
+        prompt_type = inv_prompt_type_to_model_lower[model_lower]
+        if verbose:
+            print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
+    assert prompt_type is not None, "prompt_type was None"
+
+    # Control generation hyperparameters
+    # adjust for bad inputs, e.g. in case also come from API that doesn't get constrained by gradio sliders
+    # below is for TGI server, not required for HF transformers
+    # limits are chosen similar to gradio_runner.py sliders/numbers
+    top_p = min(max(1e-3, top_p), 1.0 - 1e-3)
+    top_k = min(max(1, int(top_k)), 100)
+    temperature = min(max(0.01, temperature), 2.0)
+    # FIXME: https://github.com/h2oai/h2ogpt/issues/106
+    num_beams = 1 if stream_output else num_beams  # See max_beams in gradio_runner
+    max_max_new_tokens = get_max_max_new_tokens(chosen_model_state,
+                                                memory_restriction_level=memory_restriction_level,
+                                                max_new_tokens=max_new_tokens,
+                                                max_max_new_tokens=max_max_new_tokens)
+    if min_max_new_tokens is None:
+        # default for nochat api
+        min_max_new_tokens = 256
+    if docs_ordering_type is None:
+        docs_ordering_type = 'reverse_ucurve_sort'
+    model_max_length = get_model_max_length(chosen_model_state)
+    max_new_tokens = min(max(1, int(max_new_tokens)), max_max_new_tokens)
+    min_new_tokens = min(max(0, int(min_new_tokens)), max_new_tokens)
+    max_time = min(max(0, max_time), max_max_time)
+    repetition_penalty = min(max(0.01, repetition_penalty), 3.0)
+    num_return_sequences = 1 if chat else min(max(1, int(num_return_sequences)), 10)
+    min_top_k_docs, max_top_k_docs, label_top_k_docs = get_minmax_top_k_docs(is_public)
+    # limit total tokens processed, e.g. for summarization, if public instance
+    if is_public:
+        total_tokens_for_docs = min(2 * model_max_length, 16384)
+    else:
+        total_tokens_for_docs = None
+    top_k_docs = min(max(min_top_k_docs, int(top_k_docs)), max_top_k_docs)
+    chunk_size = min(max(128, int(chunk_size)), 2048)
+    if not context:
+        context = ''
+
+    # get prompter
+    prompter = Prompter(prompt_type, prompt_dict, debug=debug, chat=chat, stream_output=stream_output,
+                        system_prompt=system_prompt)
+
+    # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
+    assert langchain_mode in langchain_modes, "Invalid langchain_mode %s not in %s" % (langchain_mode, langchain_modes)
+    assert langchain_action in langchain_actions, "Invalid langchain_action %s not in %s" % (
+        langchain_action, langchain_actions)
+    assert len(
+        set(langchain_agents).difference(langchain_agents_list)) == 0, "Invalid langchain_agents %s" % langchain_agents
+
+    # get db, but also fill db state so return already has my_db_state and dbs filled so faster next query
+    from src.gpt_langchain import get_any_db
+    db = get_any_db(my_db_state, langchain_mode, langchain_mode_paths, langchain_mode_types,
+                    dbs=dbs,
+                    load_db_if_exists=load_db_if_exists,
+                    db_type=db_type,
+                    use_openai_embedding=use_openai_embedding,
+                    hf_embedding_model=hf_embedding_model,
+                    migrate_embedding_model=migrate_embedding_model,
+                    auto_migrate_db=auto_migrate_db,
+                    for_sources_list=True,
+                    verbose=verbose,
+                    n_jobs=n_jobs,
+                    )
+
+    t_generate = time.time()
+    langchain_only_model = base_model in non_hf_types or \
+                           load_exllama or \
+                           inference_server.startswith('replicate') or \
+                           inference_server.startswith('sagemaker') or \
+                           inference_server.startswith('openai_azure')  # also covers openai_azure_chat
+    do_langchain_path = langchain_mode not in [False, 'Disabled', 'LLM'] or \
+                        langchain_only_model or \
+                        force_langchain_evaluate or \
+                        len(text_context_list) > 0
+
+    if len(langchain_agents) > 0:
+        do_langchain_path = True
+    if add_search_to_context:
+        # easier to manage prompt etc. by doing full langchain path
+        do_langchain_path = True
+
+    if do_langchain_path:
+        text = ''
+        sources = ''
+        response = ''
+        # use smaller cut_distance for wiki_full since it yields many matches that are often irrelevant unless close
+        from gpt_langchain import run_qa_db
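+        # generation hyperparameters forwarded into the langchain QA path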
+        gen_hyper_langchain = dict(do_sample=do_sample,
+                                   temperature=temperature,
+                                   repetition_penalty=repetition_penalty,
+                                   top_k=top_k,
+                                   top_p=top_p,
+                                   num_beams=num_beams,
+                                   min_new_tokens=min_new_tokens,
+                                   max_new_tokens=max_new_tokens,
+                                   early_stopping=early_stopping,
+                                   max_time=max_time,
+                                   num_return_sequences=num_return_sequences,
+                                   )
+        loaders_dict, captions_model = gr_to_lg(image_loaders,
+                                                pdf_loaders,
+                                                url_loaders,
+                                                captions_model=captions_model,
+                                                )
+        loaders_dict.update(dict(captions_model=captions_model,
+                                 caption_loader=caption_loader,
+                                 doctr_loader=doctr_loader,
+                                 pix2struct_loader=pix2struct_loader,
+                                 jq_schema=jq_schema,
+                                 ))
+        data_point = dict(context=context, instruction=instruction, input=iinput)
+        # no longer stuff chat history directly into context this early
+        prompt_basic = prompter.generate_prompt(data_point, context_from_history=False)
+        prompt = prompt_basic
+        num_prompt_tokens = 0
+        for r in run_qa_db(
+                inference_server=inference_server,
+                model_name=base_model, model=model, tokenizer=tokenizer,
+                langchain_only_model=langchain_only_model,
+                async_output=async_output,
+                num_async=num_async,
+                prompter=prompter,
+                use_llm_if_no_docs=use_llm_if_no_docs,
+                load_db_if_exists=load_db_if_exists,
+                db=db,
+                langchain_mode_paths=langchain_mode_paths,
+                langchain_mode_types=langchain_mode_types,
+                detect_user_path_changes_every_query=detect_user_path_changes_every_query,
+                cut_distance=1.1 if langchain_mode in ['wiki_full'] else cut_distance,
+                answer_with_sources=answer_with_sources,
+                append_sources_to_answer=append_sources_to_answer,
+                add_chat_history_to_context=add_chat_history_to_context,
+                add_search_to_context=add_search_to_context,
+                keep_sources_in_context=keep_sources_in_context,
+                memory_restriction_level=memory_restriction_level,
+                system_prompt=system_prompt,
+                use_openai_embedding=use_openai_embedding,
+                use_openai_model=use_openai_model,
+                hf_embedding_model=hf_embedding_model,
+                migrate_embedding_model=migrate_embedding_model,
+                auto_migrate_db=auto_migrate_db,
+                first_para=first_para,
+                text_limit=text_limit,
+                show_accordions=show_accordions,
+                top_k_docs_max_show=top_k_docs_max_show,
+                show_link_in_sources=show_link_in_sources,
+
+                # evaluate args items
+                query=instruction,
+                iinput=iinput,
+                context=context,
+                stream_output=stream_output,
+                chunk=chunk,
+                chunk_size=chunk_size,
+
+                **loaders_dict,
+
+                langchain_mode=langchain_mode,
+                langchain_action=langchain_action,
+                langchain_agents=langchain_agents,
+                document_subset=document_subset,
+                document_choice=document_choice,
+                top_k_docs=top_k_docs,
+                prompt_type=prompt_type,
+                prompt_dict=prompt_dict,
+                pre_prompt_query=pre_prompt_query,
+                prompt_query=prompt_query,
+                pre_prompt_summary=pre_prompt_summary,
+                prompt_summary=prompt_summary,
+                text_context_list=text_context_list,
+                chat_conversation=chat_conversation,
+                visible_models=visible_models,
+                h2ogpt_key=h2ogpt_key,
+                docs_ordering_type=docs_ordering_type,
+                min_max_new_tokens=min_max_new_tokens,
+
+                **gen_hyper_langchain,
+
+                db_type=db_type,
+                n_jobs=n_jobs,
+                verbose=verbose,
+                cli=cli,
+                sanitize_bot_response=sanitize_bot_response,
+
+                lora_weights=lora_weights,
+
+                auto_reduce_chunks=auto_reduce_chunks,
+                max_chunks=max_chunks,
+                total_tokens_for_docs=total_tokens_for_docs,
+                headsize=headsize,
+        ):
+            # doesn't accumulate, new answer every yield, so only save that full answer
+            response = r['response']
+            sources = r['sources']
+            prompt = r['prompt']
+            num_prompt_tokens = r['num_prompt_tokens']
+            yield dict(response=response, sources=sources, save_dict=dict())
+        if save_dir:
+            # estimate using tiktoken
+            extra_dict = gen_hyper_langchain.copy()
+            extra_dict.update(prompt_type=prompt_type,
+                              inference_server=inference_server,
+                              langchain_mode=langchain_mode,
+                              langchain_action=langchain_action,
+                              langchain_agents=langchain_agents,
+                              document_subset=document_subset,
+                              document_choice=document_choice,
+                              chat_conversation=chat_conversation,
+                              add_search_to_context=add_search_to_context,
+                              num_prompt_tokens=num_prompt_tokens,
+                              instruction=instruction,
+                              iinput=iinput,
+                              context=context,
+                              t_generate=time.time() - t_generate,
+                              ntokens=None,
+                              tokens_persecond=None,
+                              )
+            save_dict = dict(prompt=prompt,
+                             output=response, base_model=base_model, save_dir=save_dir,
+                             where_from='run_qa_db',
+                             extra_dict=extra_dict)
+            yield dict(response=response, sources=sources, save_dict=save_dict)
+            if verbose:
+                print(
+                    'Post-Generate Langchain: %s decoded_output: %s' %
+                    (str(datetime.now()), len(response) if response else -1),
+                    flush=True)
+        if response or sources or langchain_only_model:
+            # if we got a response or sources, we're done
+            # if langchain_only_model (e.g. llama/gptj), also return, since such models can't go down the code path below
+            # otherwise (no response and no sources, so nothing to give the LLM), slip through and ask the LLM directly
+            # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
+            return
+
+    # NOT LANGCHAIN PATH, raw LLM
+    # restrict instruction + iinput, typically what has the largest input
+    prompt, \
+        instruction, iinput, context, \
+        num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
+        chat_index, top_k_docs_trial, one_doc_size = \
+        get_limited_prompt(instruction,
+                           iinput,
+                           tokenizer,
+                           prompter=prompter,
+                           inference_server=inference_server,
+                           # prompt_type=prompt_type,
+                           # prompt_dict=prompt_dict,
+                           # chat=chat,
+                           max_new_tokens=max_new_tokens,
+                           # system_prompt=system_prompt,
+                           context=context,
+                           chat_conversation=chat_conversation,
+                           keep_sources_in_context=keep_sources_in_context,
+                           model_max_length=model_max_length,
+                           memory_restriction_level=memory_restriction_level,
+                           langchain_mode=langchain_mode,
+                           add_chat_history_to_context=add_chat_history_to_context,
+                           min_max_new_tokens=min_max_new_tokens,
+                           )
+
+    if inference_server.startswith('vllm') or \
+            inference_server.startswith('openai') or \
+            inference_server.startswith('http'):
+        if inference_server.startswith('vllm') or inference_server.startswith('openai'):
+            assert not inference_server.startswith('openai_azure_chat'), "Not fo Azure, use langchain path"
+            assert not inference_server.startswith('openai_azure'), "Not for Azure, use langchain path"
+            openai, inf_type, deployment_name, base_url, api_version = set_openai(inference_server)
+            where_from = inf_type
+
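+            # collect stop strings from the prompt template so generation halts at turn boundaries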
+            terminate_response = prompter.terminate_response or []
+            stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
+            stop_sequences = [x for x in stop_sequences if x]
+            # OpenAI will complain if asked for too many new tokens; it wrongly treats max_tokens as a minimum requirement, so cap it
+            max_new_tokens_openai = min(max_new_tokens, model_max_length - num_prompt_tokens)
+            gen_server_kwargs = dict(temperature=temperature if do_sample else 0,
+                                     max_tokens=max_new_tokens_openai,
+                                     top_p=top_p if do_sample else 1,
+                                     frequency_penalty=0,
+                                     n=num_return_sequences,
+                                     presence_penalty=1.07 - repetition_penalty + 0.6,  # maps default repetition_penalty=1.07 to presence_penalty=0.6
+                                     )
+            if inf_type == 'vllm' or inference_server == 'openai':
+                responses = openai.Completion.create(
+                    model=base_model,
+                    prompt=prompt,
+                    **gen_server_kwargs,
+                    stop=stop_sequences,
+                    stream=stream_output,
+                )
+                text = ''
+                sources = ''
+                response = ''
+                if not stream_output:
+                    text = responses['choices'][0]['text']
+                    response = prompter.get_response(prompt + text, prompt=prompt,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+                else:
+                    collected_events = []
+                    for event in responses:
+                        collected_events.append(event)  # save the event response
+                        event_text = event['choices'][0]['text']  # extract the text
+                        text += event_text  # append the text
+                        response = prompter.get_response(prompt + text, prompt=prompt,
+                                                         sanitize_bot_response=sanitize_bot_response)
+                        yield dict(response=response, sources=sources, save_dict=dict())
+            elif inf_type == 'vllm_chat' or inference_server == 'openai_chat':
+                if inf_type == 'vllm_chat':
+                    raise NotImplementedError('%s not supported by vLLM' % inf_type)
+                if system_prompt in [None, 'None', 'auto']:
+                    openai_system_prompt = "You are a helpful assistant."
+                else:
+                    openai_system_prompt = system_prompt
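+                # build OpenAI chat-format messages: optional system message, then the user prompt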
+                messages0 = []
+                if openai_system_prompt:
+                    messages0.append({"role": "system", "content": openai_system_prompt})
+                messages0.append({'role': 'user', 'content': prompt})
+                responses = openai.ChatCompletion.create(
+                    model=base_model,
+                    messages=messages0,
+                    stream=stream_output,
+                    **gen_server_kwargs,
+                )
+                text = ""
+                sources = ''
+                response = ""
+                if not stream_output:
+                    text = responses["choices"][0]["message"]["content"]
+                    response = prompter.get_response(prompt + text, prompt=prompt,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+                else:
+                    for chunk in responses:
+                        delta = chunk["choices"][0]["delta"]
+                        if 'content' in delta:
+                            text += delta['content']
+                            response = prompter.get_response(prompt + text, prompt=prompt,
+                                                             sanitize_bot_response=sanitize_bot_response)
+                            yield dict(response=response, sources=sources, save_dict=dict())
+            else:
+                raise RuntimeError("No such OpenAI mode: %s" % inference_server)
+        elif inference_server.startswith('http'):
+            inference_server, headers = get_hf_server(inference_server)
+            from gradio_utils.grclient import GradioClient
+            from text_generation import Client as HFClient
+            if isinstance(model, GradioClient):
+                gr_client = model
+                hf_client = None
+            elif isinstance(model, HFClient):
+                gr_client = None
+                hf_client = model
+            else:
+                inference_server, gr_client, hf_client = get_client_from_inference_server(inference_server,
+                                                                                          base_model=base_model)
+
+            # quick sanity check to avoid long timeouts, just see if can reach server
+            requests.get(inference_server, timeout=int(os.getenv('REQUEST_TIMEOUT_FAST', '10')))
+
+            if gr_client is not None:
+                # Note: h2oGPT gradio server could handle input token size issues for prompt,
+                # but best to handle here so send less data to server
+
+                chat_client = False
+                where_from = "gr_client"
+                client_langchain_mode = 'Disabled'
+                client_add_chat_history_to_context = True
+                client_add_search_to_context = False
+                client_langchain_action = LangChainAction.QUERY.value
+                client_langchain_agents = []
+                gen_server_kwargs = dict(temperature=temperature,
+                                         top_p=top_p,
+                                         top_k=top_k,
+                                         num_beams=num_beams,
+                                         max_new_tokens=max_new_tokens,
+                                         min_new_tokens=min_new_tokens,
+                                         early_stopping=early_stopping,
+                                         max_time=max_time,
+                                         repetition_penalty=repetition_penalty,
+                                         num_return_sequences=num_return_sequences,
+                                         do_sample=do_sample,
+                                         chat=chat_client,
+                                         )
+                # account for gradio-into-gradio, where the inner gradio server handles prompting; avoid duplicating the prompter's prompt injection
+                if prompt_type in [None, '', PromptType.plain.name, PromptType.plain.value,
+                                   str(PromptType.plain.value)]:
+                    # if our prompt is plain, assume it is either correct or the gradio server knows a different prompt type,
+                    # so pass an empty prompt_type
+                    gr_prompt_type = ''
+                    gr_prompt_dict = ''
+                    gr_prompt = prompt  # already prepared prompt
+                    gr_context = ''
+                    gr_iinput = ''
+                else:
+                    # if already have prompt_type that is not plain, None, or '', then already applied some prompting
+                    #  But assume server can handle prompting, and need to avoid double-up.
+                    #  Also assume server can do better job of using stopping.py to stop early, so avoid local prompting, let server handle
+                    #  So avoid "prompt" and let gradio server reconstruct from prompt_type we passed
+                    # Note it's ok that prompter.get_response() is passed prompt+text with prompt=prompt,
+                    #  because that just means extra processing to remove the prompt; since the prompt here has
+                    #  no human-bot prompting, nothing spurious will appear
+                    gr_context = context
+                    gr_prompt = instruction
+                    gr_iinput = iinput
+                    gr_prompt_type = prompt_type
+                    gr_prompt_dict = prompt_dict
+                client_kwargs = dict(instruction=gr_prompt if chat_client else '',  # only for chat=True
+                                     iinput=gr_iinput,  # only for chat=True
+                                     context=gr_context,
+                                     # streaming output is supported, loops over and outputs each generation in streaming mode
+                                     # but leave stream_output=False for simple input/output mode
+                                     stream_output=stream_output,
+
+                                     **gen_server_kwargs,
+
+                                     prompt_type=gr_prompt_type,
+                                     prompt_dict=gr_prompt_dict,
+
+                                     instruction_nochat=gr_prompt if not chat_client else '',
+                                     iinput_nochat=gr_iinput,  # only for chat=False
+                                     langchain_mode=client_langchain_mode,
+                                     add_chat_history_to_context=client_add_chat_history_to_context,
+                                     langchain_action=client_langchain_action,
+                                     langchain_agents=client_langchain_agents,
+                                     top_k_docs=top_k_docs,
+                                     chunk=chunk,
+                                     chunk_size=chunk_size,
+                                     document_subset=DocumentSubset.Relevant.name,
+                                     document_choice=[DocumentChoice.ALL.value],
+                                     pre_prompt_query=pre_prompt_query,
+                                     prompt_query=prompt_query,
+                                     pre_prompt_summary=pre_prompt_summary,
+                                     prompt_summary=prompt_summary,
+                                     system_prompt=system_prompt,
+                                     image_loaders=image_loaders,
+                                     pdf_loaders=pdf_loaders,
+                                     url_loaders=url_loaders,
+                                     jq_schema=jq_schema,
+                                     visible_models=visible_models,
+                                     h2ogpt_key=h2ogpt_key,
+                                     add_search_to_context=client_add_search_to_context,
+                                     docs_ordering_type=None,
+                                     min_max_new_tokens=min_max_new_tokens,
+                                     )
+                api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
+                response = ''
+                text = ''
+                sources = ''
+                if not stream_output:
+                    res = gr_client.predict(str(dict(client_kwargs)), api_name=api_name)
+                    res_dict = ast.literal_eval(res)
+                    text = res_dict['response']
+                    sources = res_dict['sources']
+                    response = prompter.get_response(prompt + text, prompt=prompt,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+                else:
+                    job = gr_client.submit(str(dict(client_kwargs)), api_name=api_name)
+                    res_dict = dict(response=text, sources=sources, save_dict=dict())
+                    text0 = ''
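+                    # poll the streaming job; yield only when new text beyond what was already sent appears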
+                    while not job.done():
+                        if job.communicator.job.latest_status.code.name == 'FINISHED':
+                            break
+                        e = job.future._exception
+                        if e is not None:
+                            break
+                        outputs_list = job.communicator.job.outputs
+                        if outputs_list:
+                            res = job.communicator.job.outputs[-1]
+                            res_dict = ast.literal_eval(res)
+                            text = res_dict['response']
+                            sources = res_dict['sources']
+                            if gr_prompt_type == 'plain':
+                                # then gradio server passes back full prompt + text
+                                prompt_and_text = text
+                            else:
+                                prompt_and_text = prompt + text
+                            response = prompter.get_response(prompt_and_text, prompt=prompt,
+                                                             sanitize_bot_response=sanitize_bot_response)
+                            text_chunk = response[len(text0):]
+                            if not text_chunk:
+                                continue
+                            # save old
+                            text0 = response
+                            yield dict(response=response, sources=sources, save_dict=dict())
+                        time.sleep(0.01)
+                    # ensure get last output to avoid race
+                    res_all = job.outputs()
+                    if len(res_all) > 0:
+                        res = res_all[-1]
+                        res_dict = ast.literal_eval(res)
+                        text = res_dict['response']
+                        sources = res_dict['sources']
+                    else:
+                        # go with old text if last call didn't work
+                        e = job.future._exception
+                        if e is not None:
+                            stre = str(e)
+                            strex = ''.join(traceback.format_tb(e.__traceback__))
+                        else:
+                            stre = ''
+                            strex = ''
+
+                        print("Bad final response: %s %s %s %s %s: %s %s" % (base_model, inference_server,
+                                                                             res_all, prompt, text, stre, strex),
+                              flush=True)
+                    if gr_prompt_type == 'plain':
+                        # then gradio server passes back full prompt + text
+                        prompt_and_text = text
+                    else:
+                        prompt_and_text = prompt + text
+                    response = prompter.get_response(prompt_and_text, prompt=prompt,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+            elif hf_client:
+                # HF inference server needs control over input tokens
+                where_from = "hf_client"
+                response = ''
+                extra = ''
+                sources = ''
+
+                # prompt must include all human-bot-like tokens, already added by the prompter
+                # https://github.com/huggingface/text-generation-inference/tree/main/clients/python#types
+                terminate_response = prompter.terminate_response or []
+                stop_sequences = list(set(terminate_response + [prompter.PreResponse]))
+                stop_sequences = [x for x in stop_sequences if x]
+                gen_server_kwargs = dict(do_sample=do_sample,
+                                         max_new_tokens=max_new_tokens,
+                                         # best_of=None,
+                                         repetition_penalty=repetition_penalty,
+                                         return_full_text=False,
+                                         seed=SEED,
+                                         stop_sequences=stop_sequences,
+                                         temperature=temperature,
+                                         top_k=top_k,
+                                         top_p=top_p,
+                                         # truncate=False,  # behaves oddly
+                                         # typical_p=top_p,
+                                         # watermark=False,
+                                         # decoder_input_details=False,
+                                         )
+                # work-around for timeout at constructor time; would be an issue if multi-threading,
+                # so just use something reasonable, or max_time if larger
+                # (this acts as a lower bound because the client is re-used when multi-threading)
+                hf_client.timeout = max(300, max_time)
+                if not stream_output:
+                    text = hf_client.generate(prompt, **gen_server_kwargs).generated_text
+                    response = prompter.get_response(prompt + text, prompt=prompt,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+                else:
+                    text = ""
+                    for responses in hf_client.generate_stream(prompt, **gen_server_kwargs):
+                        if not responses.token.special:
+                            # stop_sequences
+                            text_chunk = responses.token.text
+                            text += text_chunk
+                            response = prompter.get_response(prompt + text, prompt=prompt,
+                                                             sanitize_bot_response=sanitize_bot_response)
+                            sources = ''
+                            yield dict(response=response, sources=sources, save_dict=dict())
+            else:
+                raise RuntimeError("Failed to get client: %s" % inference_server)
+        else:
+            raise RuntimeError("No such inference_server  %s" % inference_server)
+
+        if save_dir and text:
+            # save prompt + new text
+            extra_dict = gen_server_kwargs.copy()
+            extra_dict.update(dict(inference_server=inference_server, num_prompt_tokens=num_prompt_tokens,
+                                   t_generate=time.time() - t_generate,
+                                   ntokens=None,
+                                   tokens_persecond=None,
+                                   ))
+            save_dict = dict(prompt=prompt, output=text, base_model=base_model, save_dir=save_dir,
+                             where_from=where_from, extra_dict=extra_dict)
+            yield dict(response=response, sources=sources, save_dict=save_dict)
+        return
+    else:
+        assert not inference_server, "inference_server=%s not supported" % inference_server
+
+    if isinstance(tokenizer, str):
+        # pipeline
+        if tokenizer == "summarization":
+            key = 'summary_text'
+        else:
+            raise RuntimeError("No such task type %s" % tokenizer)
+        # NOTE: uses max_length only
+        sources = ''
+        yield dict(response=model(prompt, max_length=max_new_tokens)[0][key], sources=sources, save_dict=dict())
+        # the pipeline path ends here; code below expects a real tokenizer object
+        return
+
+    if 'mbart-' in base_model.lower():
+        assert src_lang is not None
+        tokenizer.src_lang = languages_covered()[src_lang]
+
+    stopping_criteria = get_stopping(prompt_type, prompt_dict, tokenizer, device, base_model,
+                                     model_max_length=model_max_length,
+                                     prompter=prompter)
+
+    inputs = tokenizer(prompt, return_tensors="pt")
+    if debug and len(inputs["input_ids"]) > 0:
+        print('input_ids length', len(inputs["input_ids"][0]), flush=True)
+    input_ids = inputs["input_ids"].to(device)
+    # CRITICAL LIMIT, else generation will fail
+    max_max_tokens = tokenizer.model_max_length
+    max_input_tokens = max(0, int(max_max_tokens - min_new_tokens))
+    # NOTE: Don't limit up front due to max_new_tokens, let go up to max or reach max_max_tokens in stopping.py
+    assert isinstance(max_input_tokens, int), "Bad type for max_input_tokens=%s %s" % (
+        max_input_tokens, type(max_input_tokens))
+    input_ids = input_ids[:, -max_input_tokens:]
+    # required for falcon if multiple threads or asyncio accesses to model during generation
+    if use_cache is None:
+        use_cache = False if 'falcon' in base_model else True
+    gen_config_kwargs = dict(num_beams=num_beams,
+                             do_sample=do_sample,
+                             repetition_penalty=float(repetition_penalty),
+                             num_return_sequences=num_return_sequences,
+                             renormalize_logits=True,
+                             remove_invalid_values=True,
+                             use_cache=use_cache,
+                             )
+    if do_sample:
+        gen_config_kwargs.update(dict(temperature=float(temperature),
+                                      top_p=float(top_p),
+                                      top_k=top_k))
+    # unclear impact, some odd things going on inside
+    # leads to:
+    #   The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
+    #   Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
+    # or leads to:
+    #   Using cls_token, but it is not set yet.
+    #   Using mask_token, but it is not set yet.
+    #   Using pad_token, but it is not set yet.
+    #   Using sep_token, but it is not set yet.
+    # copy the tokenizer's special token ids into the generation config when available
+    token_ids = ['eos_token_id', 'pad_token_id', 'bos_token_id', 'cls_token_id', 'sep_token_id']
+    for token_id in token_ids:
+        if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None:
+            gen_config_kwargs.update({token_id: getattr(tokenizer, token_id)})
+    generation_config = GenerationConfig(**gen_config_kwargs)
+
+    gen_kwargs = dict(input_ids=input_ids,
+                      generation_config=generation_config,
+                      return_dict_in_generate=True,
+                      output_scores=True,
+                      max_new_tokens=max_new_tokens,  # new tokens only, beyond the prompt
+                      min_new_tokens=min_new_tokens,  # new tokens only, beyond the prompt
+                      early_stopping=early_stopping,  # False, True, "never"
+                      max_time=max_time,
+                      stopping_criteria=stopping_criteria,
+                      )
+    if 'gpt2' in base_model.lower():
+        gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id))
+    elif 'mbart-' in base_model.lower():
+        assert tgt_lang is not None
+        tgt_lang = languages_covered()[tgt_lang]
+        gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]))
+    else:
+        token_ids = ['eos_token_id', 'bos_token_id', 'pad_token_id']
+        for token_id in token_ids:
+            if hasattr(tokenizer, token_id) and getattr(tokenizer, token_id) is not None:
+                gen_kwargs.update({token_id: getattr(tokenizer, token_id)})
+
+    decoder_kwargs = dict(skip_special_tokens=True,
+                          clean_up_tokenization_spaces=True)
+
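+    # decoder used on full generated sequences in the non-streaming path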
+    decoder = functools.partial(tokenizer.decode,
+                                **decoder_kwargs
+                                )
+    with torch.no_grad():
+        have_lora_weights = lora_weights not in [no_lora_str, '', None]
+        context_class_cast = NullContext if device == 'cpu' or have_lora_weights or device == 'mps' else torch.autocast
+        if t5_type(base_model):
+            # issues when casting to float16, can mess up t5 model, e.g. only when not streaming, or other odd behaviors
+            context_class_cast = NullContext
+        with context_class_cast(device):
+            # protection for gradio not keeping track of closed users,
+            # else hit bitsandbytes lack of thread safety:
+            # https://github.com/h2oai/h2ogpt/issues/104
+            # but only makes sense if concurrency_count == 1
+            context_class = NullContext  # if concurrency_count > 1 else filelock.FileLock
+            if verbose:
+                print('Pre-Generate: %s' % str(datetime.now()), flush=True)
+            decoded_output = None
+            response = ''
+            with context_class("generate.lock"):
+                if verbose:
+                    print('Generate: %s' % str(datetime.now()), flush=True)
+                always_use_streaming_method = True  # to deal with complex parsing of prompt vs. generation due to odd tokenizing
+                if stream_output or always_use_streaming_method:
+                    skip_prompt = True  # True means first output excludes prompt
+                    streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False,
+                                                       **decoder_kwargs)
+                    gen_kwargs.update(dict(streamer=streamer))
+                    target = wrapped_partial(generate_with_exceptions, model.generate,
+                                             raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
+                                             **gen_kwargs)
+                    bucket = queue.Queue()
+                    thread = EThread(target=target, streamer=streamer, bucket=bucket)
+                    thread.start()
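+                    # generation now runs in the background thread; consume decoded text from streamer as it arrives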
+                    ret = dict(response='', sources='', save_dict=dict())
+                    outputs = ""
+                    sources = ''
+                    try:
+                        for new_text in streamer:
+                            if bucket.qsize() > 0 or thread.exc:
+                                thread.join()
+                            outputs += new_text
+                            response = prompter.get_response(outputs, prompt=None,
+                                                             only_new_text=True,
+                                                             sanitize_bot_response=sanitize_bot_response)
+                            ret = dict(response=response, sources=sources, save_dict=dict())
+                            if stream_output:
+                                yield ret
+                        if not stream_output:
+                            yield ret
+                    except BaseException:
+                        # if any exception, raise that exception if was from thread, first
+                        if thread.exc:
+                            raise thread.exc
+                        raise
+                    finally:
+                        # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
+                        # in case no exception and didn't join with thread yet, then join
+                        if not thread.exc:
+                            thread.join()
+                    # in case raise StopIteration or broke queue loop in streamer, but still have exception
+                    if thread.exc:
+                        raise thread.exc
+                    decoded_output = outputs
+                    ntokens = len(outputs) // 4  # hack for now: rough estimate of ~4 chars per token
+                else:
+                    # length-based prompt removal below doesn't work in general, because re-encoding does not match the model's internal generation
+                    input_ids_len = gen_kwargs['input_ids'][0].shape[0]
+                    try:
+                        outputs = model.generate(**gen_kwargs)
+                    finally:
+                        pass
+                        # don't clear torch cache here, delays multi-generation, and bot(), all_bot(), and evaluate_nochat() do it
+                    # skip first IDs
+                    ntokens = sum([len(s) - input_ids_len for s in outputs.sequences]) if save_dir else -1
+                    outputs = [decoder(s[input_ids_len:]) for s in outputs.sequences]
+                    sources = ''
+                    response = prompter.get_response(outputs, prompt=None,
+                                                     only_new_text=True,
+                                                     sanitize_bot_response=sanitize_bot_response)
+                    yield dict(response=response, sources=sources, save_dict=dict())
+                    if outputs and len(outputs) >= 1:
+                        decoded_output = prompt + outputs[0]
+                if save_dir and decoded_output:
+                    extra_dict = gen_config_kwargs.copy()
+                    extra_dict.update(dict(num_prompt_tokens=num_prompt_tokens,
+                                           t_generate=time.time() - t_generate,
+                                           ntokens=ntokens,
+                                           tokens_persecond=ntokens / (time.time() - t_generate),
+                                           ))
+                    save_dict = dict(prompt=prompt, output=decoded_output, base_model=base_model, save_dir=save_dir,
+                                     where_from="evaluate_%s" % str(stream_output),
+                                     extra_dict=extra_dict)
+                    yield dict(response=response, sources=sources, save_dict=save_dict)
+            if verbose:
+                print('Post-Generate: %s decoded_output: %s' % (
+                    str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
+
+
+inputs_list_names = list(inspect.signature(evaluate).parameters)
+state_names = input_args_list.copy()  # doesn't have to be the same, but state_names must match evaluate() and how it is filled
+inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]
+
+
+def get_cutoffs(memory_restriction_level, for_context=False, model_max_length=2048):
+    # the -256 below helps avoid errors like:
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
+    # RuntimeError: expected scalar type Half but found Float
+    if memory_restriction_level > 0:
+        max_length_tokenize = 768 - 256 if memory_restriction_level <= 2 else 512 - 256
+    else:
+        # at least give room for 1 paragraph output
+        max_length_tokenize = model_max_length - 256
+    cutoff_len = max_length_tokenize * 4  # ~4 chars per token; if limit is reached, can't generate new tokens
+    output_smallest = 30 * 4  # reserve room for at least ~30 tokens of output
+    max_prompt_length = cutoff_len - output_smallest
+
+    if for_context:
+        # then lower even more to avoid later chopping, since tokens are only estimated for the context in bot()
+        max_prompt_length = max(64, int(max_prompt_length * 0.8))
+
+    return cutoff_len, output_smallest, max_length_tokenize, max_prompt_length
+
+
+class H2OTextIteratorStreamer(TextIteratorStreamer):
+    """
+    Normally a timeout would be required so that get() exceptions can be handled;
+    with this H2O version of TextIteratorStreamer, we instead loop over a non-blocking get() and handle stops explicitly
+    """
+
+    def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None,
+                 block=True, **decode_kwargs):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = queue.Queue()
+        self.stop_signal = None
+        self.do_stop = False
+        self.timeout = timeout
+        self.block = block
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.text_queue.put(text, timeout=self.timeout)
+        if stream_end:
+            self.text_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        while True:
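+            # when block=False, the non-blocking get() plus the short sleep below lets do_stop be noticed promptly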
+            try:
+                value = self.stop_signal  # PyCharm flags this assignment as unused, but it guards the comparison after the loop
+                if self.do_stop:
+                    print("hit stop", flush=True)
+                    # could raise or break, maybe best to raise and make parent see if any exception in thread
+                    self.clear_queue()
+                    self.do_stop = False
+                    raise StopIteration()
+                    # break
+                value = self.text_queue.get(block=self.block, timeout=self.timeout)
+                break
+            except queue.Empty:
+                time.sleep(0.01)
+        if value == self.stop_signal:
+            self.clear_queue()
+            self.do_stop = False
+            raise StopIteration()
+        else:
+            return value
+
+    def clear_queue(self):
+        # make sure streamer is reusable after stop hit
+        with self.text_queue.mutex:
+            self.text_queue.queue.clear()
+
+    def put(self, value):
+        """
+        Receives tokens, decodes them, and emits text as soon as entire words are formed.
+        Same as the base class, except the default-branch text.rfind(" ") hack that ruins LLaMa2 is removed.
+        """
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError("TextStreamer only supports batch size 1")
+        elif len(value.shape) > 1:
+            value = value[0]
+
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+
+        # Add the new token to the cache and decode the entire thing.
+        self.token_cache.extend(value.tolist())
+        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+
+        # After the symbol for a new line, we flush the cache.
+        if text.endswith("\n"):
+            printable_text = text[self.print_len:]
+            self.token_cache = []
+            self.print_len = 0
+        # If the last token is a CJK character, we print the characters.
+        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
+            printable_text = text[self.print_len:]
+            self.print_len += len(printable_text)
+        # Otherwise, if the text ends in a replacement character (incomplete multi-byte sequence),
+        # hold back to the last space so incomplete words aren't emitted
+        elif len(text) > 0 and text[-1] == '�':
+            printable_text = text[self.print_len: text.rfind(" ") + 1]
+            self.print_len += len(printable_text)
+        else:
+            printable_text = text[self.print_len:]
+            self.print_len += len(printable_text)
+
+        self.on_finalized_text(printable_text)
+
+
+def generate_with_exceptions(func, *args, raise_generate_gpu_exceptions=True, **kwargs):
+    try:
+        func(*args, **kwargs)
+    except torch.cuda.OutOfMemoryError as e:
+        print("GPU OOM 2: exception: %s" % str(e),
+              flush=True)
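+        # move inputs off GPU and drop the reference so clear_torch_cache() can actually reclaim memory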
+        if 'input_ids' in kwargs:
+            if kwargs['input_ids'] is not None:
+                kwargs['input_ids'].cpu()
+            kwargs['input_ids'] = None
+        traceback.print_exc()
+        clear_torch_cache()
+        return
+    except (Exception, RuntimeError) as e:
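+        # recognize known GPU/dtype errors by message substring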
+        if 'Expected all tensors to be on the same device' in str(e) or \
+                'expected scalar type Half but found Float' in str(e) or \
+                'probability tensor contains either' in str(e) or \
+                'cublasLt ran into an error!' in str(e) or \
+                'mat1 and mat2 shapes cannot be multiplied' in str(e):
+            print(
+                "GPU Error: exception: %s" % str(e),
+                flush=True)
+            traceback.print_exc()
+            clear_torch_cache()
+            if raise_generate_gpu_exceptions:
+                raise
+            return
+        else:
+            clear_torch_cache()
+            if raise_generate_gpu_exceptions:
+                raise
+
+
+def get_generate_params(model_lower,
+                        chat,
+                        stream_output, show_examples,
+                        prompt_type, prompt_dict,
+                        system_prompt,
+                        pre_prompt_query, prompt_query,
+                        pre_prompt_summary, prompt_summary,
+                        temperature, top_p, top_k, num_beams,
+                        max_new_tokens, min_new_tokens, early_stopping, max_time,
+                        repetition_penalty, num_return_sequences,
+                        do_sample,
+                        top_k_docs, chunk, chunk_size,
+                        image_loaders,
+                        pdf_loaders,
+                        url_loaders,
+                        jq_schema,
+                        docs_ordering_type,
+                        min_max_new_tokens,
+                        verbose,
+                        ):
+    use_defaults = False
+    use_default_examples = True
+    examples = []
+    task_info = 'LLM'
+    if model_lower:
+        print(f"Using Model {model_lower}", flush=True)
+    else:
+        if verbose:
+            print("No model defined yet", flush=True)
+
+    min_new_tokens = min_new_tokens if min_new_tokens is not None else 0
+    early_stopping = early_stopping if early_stopping is not None else False
+    max_time_defaults = 60 * 3
+    max_time = max_time if max_time is not None else max_time_defaults
+
+    if not prompt_type and model_lower in inv_prompt_type_to_model_lower and prompt_type != 'custom':
+        prompt_type = inv_prompt_type_to_model_lower[model_lower]
+        if verbose:
+            print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
+
+    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
+    if show_examples is None:
+        if chat:
+            show_examples = False
+        else:
+            show_examples = True
+
+    summarize_example1 = """Jeff: Can I train a ? Transformers model on Amazon SageMaker?
+Philipp: Sure you can use the new Hugging Face Deep Learning Container.
+Jeff: ok.
+Jeff: and how can I get started?
+Jeff: where can I find documentation?
+Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face"""
+
+    use_placeholder_instruction_as_example = False
+    if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower:
+        placeholder_instruction = summarize_example1
+        placeholder_input = ""
+        use_defaults = True
+        use_default_examples = False
+        use_placeholder_instruction_as_example = True
+        task_info = "Summarization"
+    elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower:
+        placeholder_instruction = "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
+        placeholder_input = ""
+        use_defaults = True
+        use_default_examples = True
+        task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc.  Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)"
+    elif 'mbart-' in model_lower:
+        placeholder_instruction = "The girl has long hair."
+        placeholder_input = ""
+        use_defaults = True
+        use_default_examples = False
+        use_placeholder_instruction_as_example = True
+    elif 'gpt2' in model_lower:
+        placeholder_instruction = "The sky is"
+        placeholder_input = ""
+        prompt_type = prompt_type or 'plain'
+        use_default_examples = True  # some will be odd "continuations" but can be ok
+        use_placeholder_instruction_as_example = True
+        task_info = "Auto-complete phrase, code, etc."
+        use_defaults = True
+    else:
+        if chat:
+            placeholder_instruction = ""
+        else:
+            placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
+        placeholder_input = ""
+        if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
+            prompt_type = inv_prompt_type_to_model_lower[model_lower]
+        elif model_lower:
+            # default is plain, because might rely upon trust_remote_code to handle prompting
+            prompt_type = prompt_type or 'plain'
+        else:
+            prompt_type = ''
+        task_info = "No task"
+        if prompt_type == 'instruct':
+            task_info = "Answer question or follow imperative as instruction with optionally input."
+        elif prompt_type == 'plain':
+            task_info = "Auto-complete phrase, code, etc."
+        elif prompt_type == 'human_bot':
+            if chat:
+                task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)"
+            else:
+                task_info = "Ask question/imperative (input concatenated with instruction)"
+
+    # revert to plain if still nothing
+    prompt_type = prompt_type or 'plain'
+    if use_defaults:
+        temperature = 1.0 if temperature is None else temperature
+        top_p = 1.0 if top_p is None else top_p
+        max_new_tokens = max_new_tokens or 512
+    else:
+        temperature = 0.1 if temperature is None else temperature
+        top_p = 0.75 if top_p is None else top_p
+        max_new_tokens = max_new_tokens or 1024
+    # shared defaults for either case
+    top_k = 40 if top_k is None else top_k
+    num_beams = num_beams or 1
+    repetition_penalty = repetition_penalty or 1.07
+    num_return_sequences = min(num_beams, num_return_sequences or 1)
+    do_sample = False if do_sample is None else do_sample
+    # doesn't include chat, instruction_nochat, iinput_nochat, added later
+    params_list = ["",
+                   stream_output,
+                   prompt_type, prompt_dict,
+                   temperature, top_p, top_k, num_beams,
+                   max_new_tokens, min_new_tokens,
+                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
+
+    if use_placeholder_instruction_as_example:
+        examples += [[placeholder_instruction, ''] + params_list]
+
+    if use_default_examples:
+        examples += [
+            ["Translate English to French", "Good morning"] + params_list,
+            ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list,
+            ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list,
+            [
+                "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.",
+                ''] + params_list,
+            ['Translate to German:  My name is Arthur', ''] + params_list,
+            ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list,
+            ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.',
+             ''] + params_list,
+            ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list,
+            ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list,
+            ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list,
+            [
+                "Premise: At my age you will probably have learnt one lesson. Hypothesis:  It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?",
+                ''] + params_list,
+            ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list,
+            [
+                'Answer the following question by reasoning step by step.  The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apples do they have?',
+                ''] + params_list,
+            ["""def area_of_rectangle(a: float, b: float):
+    \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list,
+            ["""# a function in native python:
+def mean(a):
+    return sum(a)/len(a)
+
+# the same function using numpy:
+import numpy as np
+def mean(a):""", ''] + params_list,
+            ["""X = np.random.randn(100, 100)
+y = np.random.randint(0, 1, 100)
+
+# fit random forest classifier with 20 estimators""", ''] + params_list,
+        ]
+    # add summary example
+    examples += [
+        [summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else ''] + params_list]
+
+    src_lang = "English"
+    tgt_lang = "Russian"
+
+    # extend each example with the remaining eval parameters, in their expected positions
+    for example in examples:
+        example += [chat, '', '', LangChainMode.DISABLED.value, True,
+                    LangChainAction.QUERY.value, [],
+                    top_k_docs, chunk, chunk_size, DocumentSubset.Relevant.name, [],
+                    pre_prompt_query, prompt_query,
+                    pre_prompt_summary, prompt_summary,
+                    system_prompt,
+                    image_loaders,
+                    pdf_loaders,
+                    url_loaders,
+                    jq_schema,
+                    None,
+                    None,
+                    False,
+                    None,
+                    None,
+                    docs_ordering_type,
+                    min_max_new_tokens,
+                    ]
+        # adjust examples if non-chat mode
+        if not chat:
+            example[eval_func_param_names.index('instruction_nochat')] = example[
+                eval_func_param_names.index('instruction')]
+            example[eval_func_param_names.index('instruction')] = ''
+
+            example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
+            example[eval_func_param_names.index('iinput')] = ''
+        assert len(example) == len(eval_func_param_names), "Wrong example: %s %s" % (
+            len(example), len(eval_func_param_names))
+
+    if prompt_type == PromptType.custom.name and not prompt_dict:
+        raise ValueError("Unexpected to get non-empty prompt_dict=%s for prompt_type=%s" % (prompt_dict, prompt_type))
+
+    # get prompt_dict from prompt_type, so user can see in UI etc., or for custom do nothing except check format
+    prompt_dict, error0 = get_prompt(prompt_type, prompt_dict,
+                                     chat=False, context='', reduced=False, making_context=False, return_dict=True,
+                                     system_prompt=system_prompt)
+    if error0:
+        raise RuntimeError("Prompt wrong: %s" % error0)
+
+    return placeholder_instruction, placeholder_input, \
+        stream_output, show_examples, \
+        prompt_type, prompt_dict, \
+        temperature, top_p, top_k, num_beams, \
+        max_new_tokens, min_new_tokens, early_stopping, max_time, \
+        repetition_penalty, num_return_sequences, \
+        do_sample, \
+        src_lang, tgt_lang, \
+        examples, \
+        task_info
+
+
+def languages_covered():
+    # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered
+    covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)"""
+    covered = covered.split(', ')
+    covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered}
+    return covered
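+# Usage sketch (values taken directly from the mapping string above):
+#   >>> covered = languages_covered()
+#   >>> covered['English'], covered['Chinese']
+#   ('en_XX', 'zh_CN')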
+
+
+def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
+    question = question[-cutoff_len:]
+    answer = answer[-cutoff_len:]
+
+    inputs = stokenizer(question, answer,
+                        return_tensors="pt",
+                        truncation=True,
+                        max_length=max_length_tokenize).to(smodel.device)
+    try:
+        # inputs already moved to smodel.device above
+        score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
+    except torch.cuda.OutOfMemoryError as e:
+        print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
+        del inputs
+        traceback.print_exc()
+        clear_torch_cache()
+        return 'Response Score: GPU OOM'
+    except Exception as e:  # RuntimeError is already an Exception subclass
+        if 'Expected all tensors to be on the same device' in str(e) or \
+                'expected scalar type Half but found Float' in str(e) or \
+                'probability tensor contains either' in str(e) or \
+                'cublasLt ran into an error!' in str(e) or \
+                'device-side assert triggered' in str(e):
+            print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
+                  flush=True)
+            traceback.print_exc()
+            clear_torch_cache()
+            return 'Response Score: GPU Error'
+        else:
+            raise
+    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+    return score
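+# Usage sketch (hypothetical: smodel/stokenizer must be a loaded sequence-classification
+# scoring model and its tokenizer; neither is defined in this file):
+#   score = score_qa(smodel, stokenizer, max_length_tokenize=512,
+#                    question="What is 2+2?", answer="4", cutoff_len=2048)
+#   # score is a float in (0, 1) from the sigmoid, or an error string on GPU failure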
+
+
+def check_locals(**kwargs):
+    # ensure everything in evaluate is here
+    can_skip_because_locally_generated = no_default_param_names + [
+        # get_model:
+        'reward_type'
+    ]
+    for k in eval_func_param_names:
+        if k in can_skip_because_locally_generated:
+            continue
+        assert k in kwargs, "Missing %s" % k
+    for k in inputs_kwargs_list:
+        if k in can_skip_because_locally_generated:
+            continue
+        assert k in kwargs, "Missing %s" % k
+
+    for k in list(inspect.signature(get_model).parameters):
+        if k in can_skip_because_locally_generated:
+            continue
+        assert k in kwargs, "Missing %s" % k
+
+
+def get_model_max_length(model_state):
+    if not isinstance(model_state['tokenizer'], (str, type(None))):
+        return model_state['tokenizer'].model_max_length
+    else:
+        return 2048
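+# Sketch: with no real tokenizer loaded (str or None placeholder), 2048 is assumed:
+#   >>> get_model_max_length(dict(tokenizer=None))
+#   2048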
+
+
+def get_max_max_new_tokens(model_state, **kwargs):
+    if not isinstance(model_state['tokenizer'], (str, type(None))):
+        max_max_new_tokens = model_state['tokenizer'].model_max_length
+    else:
+        max_max_new_tokens = None
+
+    if kwargs['max_max_new_tokens'] is not None and max_max_new_tokens is not None:
+        return min(max_max_new_tokens, kwargs['max_max_new_tokens'])
+    elif kwargs['max_max_new_tokens'] is not None:
+        return kwargs['max_max_new_tokens']
+    elif kwargs['memory_restriction_level'] == 1:
+        return 768
+    elif kwargs['memory_restriction_level'] == 2:
+        return 512
+    elif kwargs['memory_restriction_level'] >= 3:
+        return 256
+    else:
+        # FIXME: Need to update after new model loaded, so user can control with slider
+        return 2048
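+# Sketch: without a real tokenizer or an explicit cap, memory_restriction_level decides:
+#   >>> get_max_max_new_tokens(dict(tokenizer=None), max_max_new_tokens=None, memory_restriction_level=2)
+#   512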
+
+
+def get_minmax_top_k_docs(is_public):
+    if is_public:
+        min_top_k_docs = 1
+        max_top_k_docs = 8
+        label_top_k_docs = "Number of document chunks"
+    else:
+        min_top_k_docs = -1
+        max_top_k_docs = 100
+        label_top_k_docs = "Number of document chunks (-1 = auto fill model context)"
+    return min_top_k_docs, max_top_k_docs, label_top_k_docs
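+# Usage sketch:
+#   >>> get_minmax_top_k_docs(is_public=True)
+#   (1, 8, 'Number of document chunks')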
+
+
+def merge_chat_conversation_history(chat_conversation1, history):
+    # chat_conversation and history ordered so largest index of list is most recent
+    if chat_conversation1:
+        chat_conversation1 = str_to_list(chat_conversation1)
+        for conv1 in chat_conversation1:
+            assert isinstance(conv1, (list, tuple))
+            assert len(conv1) == 2
+
+    if isinstance(history, list):
+        # make copy so only local change
+        if chat_conversation1:
+            # priority goes to the newest entries from actual UI chat history; chat_conversation is prepended as older
+            history = chat_conversation1 + history.copy()
+    elif chat_conversation1:
+        history = chat_conversation1
+    else:
+        history = []
+    return history
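+# Example sketch (assumes str_to_list parses the list-like string): chat_conversation
+# entries are treated as older, so they are prepended before the UI history:
+#   >>> merge_chat_conversation_history('[["old q", "old a"]]', [["new q", "new a"]])
+#   [['old q', 'old a'], ['new q', 'new a']]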
+
+
+def history_to_context(history, langchain_mode=None,
+                       add_chat_history_to_context=None,
+                       prompt_type=None, prompt_dict=None, chat=None, model_max_length=None,
+                       memory_restriction_level=None, keep_sources_in_context=None,
+                       system_prompt=None, chat_conversation=None):
+    """
+    consumes all history up to (but not including) latest history item that is presumed to be an [instruction, None] pair
+    :param history:
+    :param langchain_mode:
+    :param add_chat_history_to_context:
+    :param prompt_type:
+    :param prompt_dict:
+    :param chat:
+    :param model_max_length:
+    :param memory_restriction_level:
+    :param keep_sources_in_context:
+    :param system_prompt:
+    :param chat_conversation:
+    :return:
+    """
+    history = merge_chat_conversation_history(chat_conversation, history)
+
+    if len(history) >= 1 and len(history[-1]) >= 2 and not history[-1][1]:
+        len_history = len(history) - 1
+    else:
+        # full history
+        len_history = len(history)
+
+    # get model-specific cutoffs so the built context fits this model's max length
+    _, _, _, max_prompt_length = get_cutoffs(memory_restriction_level,
+                                             for_context=True, model_max_length=model_max_length)
+    context1 = ''
+    if max_prompt_length is not None and add_chat_history_to_context:
+        # len_history above already excludes the current [instruction, None] pair from user()
+        for histi in range(0, len_history):
+            data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
+            prompt, pre_response, terminate_response, chat_sep, chat_turn_sep = \
+                generate_prompt(data_point,
+                                prompt_type,
+                                prompt_dict,
+                                chat,
+                                reduced=True,
+                                making_context=True,
+                                system_prompt=system_prompt,
+                                histi=histi)
+            # md -> back to text, maybe not super important if model trained enough
+            if not keep_sources_in_context and langchain_mode != 'Disabled' and prompt.find(super_source_prefix) >= 0:
+                # FIXME: This is relatively slow even for small amount of text, like 0.3s each history item
+                import re
+                prompt = re.sub(f'{re.escape(super_source_prefix)}.*?{re.escape(super_source_postfix)}', '', prompt,
+                                flags=re.DOTALL)
+                if prompt.endswith('\n<p>'):
+                    prompt = prompt[:-4]
+            prompt = prompt.replace('<br>', chat_turn_sep)
+            if not prompt.endswith(chat_turn_sep):
+                prompt += chat_turn_sep
+            # most recent first, add older if can
+            # only include desired chat history
+            if len(prompt + context1) > max_prompt_length:
+                break
+            context1 += prompt
+
+        _, pre_response, terminate_response, chat_sep, chat_turn_sep = \
+            generate_prompt({}, prompt_type, prompt_dict,
+                            chat, reduced=True,
+                            making_context=True,
+                            system_prompt=system_prompt,
+                            histi=-1)
+        if context1 and not context1.endswith(chat_turn_sep):
+            context1 += chat_turn_sep  # ensure if terminates abruptly, then human continues on next line
+    return context1
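+# Usage sketch (hypothetical two-turn history; the last item is the pending
+# [instruction, None] pair and so is excluded from the built context):
+#   context = history_to_context([["What is 2+2?", "4"], ["And 3+3?", None]],
+#                                langchain_mode='Disabled',
+#                                add_chat_history_to_context=True,
+#                                prompt_type='human_bot', prompt_dict=None, chat=True,
+#                                model_max_length=2048, memory_restriction_level=0,
+#                                keep_sources_in_context=False,
+#                                system_prompt='', chat_conversation=None)
+#   # context contains only the completed "What is 2+2?" -> "4" turn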
+
+
+def get_limited_prompt(instruction,
+                       iinput,
+                       tokenizer,
+                       prompter=None,
+                       inference_server=None,
+                       prompt_type=None, prompt_dict=None, chat=False, max_new_tokens=None,
+                       system_prompt='',
+                       context='', chat_conversation=None, text_context_list=None,
+                       keep_sources_in_context=False,
+                       model_max_length=None, memory_restriction_level=0,
+                       langchain_mode=None, add_chat_history_to_context=True,
+                       verbose=False,
+                       doc_importance=0.5,
+                       min_max_new_tokens=256,
+                       ):
+    if prompter:
+        prompt_type = prompter.prompt_type
+        prompt_dict = prompter.prompt_dict
+        chat = prompter.chat
+        stream_output = prompter.stream_output
+        system_prompt = prompter.system_prompt
+    else:
+        # prompter is needed below for generate_prompt, so construct it up front
+        # (debug and stream_output don't matter for prompt construction)
+        prompter = Prompter(prompt_type, prompt_dict, debug=False, chat=chat, stream_output=False,
+                            system_prompt=system_prompt)
+
+    # merge handles the case where chat_conversation is None
+    history = merge_chat_conversation_history(chat_conversation, [])
+    history_to_context_func = functools.partial(history_to_context,
+                                                langchain_mode=langchain_mode,
+                                                add_chat_history_to_context=add_chat_history_to_context,
+                                                prompt_type=prompt_type,
+                                                prompt_dict=prompt_dict,
+                                                chat=chat,
+                                                model_max_length=model_max_length,
+                                                memory_restriction_level=memory_restriction_level,
+                                                keep_sources_in_context=keep_sources_in_context,
+                                                system_prompt=system_prompt)
+    context2 = history_to_context_func(history)
+    context1 = context
+    if context1 is None:
+        context1 = ''
+
+    from h2oai_pipeline import H2OTextGenerationPipeline
+    data_point_just_instruction = dict(context='', instruction=instruction, input='')
+    prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction)
+    instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer)
+    # use the token count of the fully templated prompt, not just the raw instruction
+    num_instruction_tokens = get_token_count(prompt_just_instruction, tokenizer)
+
+    context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer)
+    context2, num_context2_tokens = H2OTextGenerationPipeline.limit_prompt(context2, tokenizer)
+    iinput, num_iinput_tokens = H2OTextGenerationPipeline.limit_prompt(iinput, tokenizer)
+    if text_context_list is None:
+        text_context_list = []
+    num_doc_tokens = sum([get_token_count(x + '\n\n', tokenizer) for x in text_context_list])
+
+    num_prompt_tokens0 = (num_instruction_tokens or 0) + \
+                         (num_context1_tokens or 0) + \
+                         (num_context2_tokens or 0) + \
+                         (num_iinput_tokens or 0) + \
+                         (num_doc_tokens or 0)
+
+    # go down to no less than 256 tokens, about 1 paragraph
+    # cap by max_new_tokens before num_prompt_tokens0 enters the budget, else result could be negative or ~0
+    min_max_new_tokens = min(min_max_new_tokens, max_new_tokens)
+    # by default assume can handle all chat and docs
+    chat_index = 0
+
+    # allowed residual is either the doc_importance share of context (if docs exceed that share), or whatever the docs didn't consume
+    num_non_doc_tokens = num_prompt_tokens0 - num_doc_tokens
+    # allocate budget to docs first, then non-docs; order shouldn't matter much either way
+    doc_max_length = max(model_max_length - num_non_doc_tokens, doc_importance * model_max_length)
+    top_k_docs, one_doc_size, num_doc_tokens = get_docs_tokens(tokenizer, text_context_list=text_context_list,
+                                                               max_input_tokens=doc_max_length)
+    non_doc_max_length = max(model_max_length - num_doc_tokens, (1.0 - doc_importance) * model_max_length)
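+    # e.g. (hypothetical numbers with doc_importance=0.5): model_max_length=4096,
+    # num_non_doc_tokens=1000 -> doc_max_length = max(3096, 2048) = 3096 tokens for docs;
+    # if docs then consume 3000 tokens -> non_doc_max_length = max(1096, 2048) = 2048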
+
+    if num_non_doc_tokens > non_doc_max_length:
+        # need to limit in some way, keep portion of history but all of context and instruction
+        # 1) drop iinput (unusual to include anyways)
+        # 2) reduce history
+        # 3) reduce context1
+        # 4) limit instruction so will fit
+        diff1 = non_doc_max_length - (
+                num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens)
+        diff2 = non_doc_max_length - (num_instruction_tokens + num_context1_tokens + min_max_new_tokens)
+        diff3 = non_doc_max_length - (num_instruction_tokens + min_max_new_tokens)
+        diff4 = non_doc_max_length - min_max_new_tokens
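+        # e.g. (hypothetical): non_doc_max_length=2000 with instruction=300, context1=200,
+        # context2=900, min_max_new_tokens=256 -> diff1 = 2000 - 1656 = 344 > 0,
+        # so dropping iinput alone (#1) is enough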
+        if diff1 > 0:
+            # then should be able to do #1
+            iinput = ''
+            num_iinput_tokens = 0
+        elif diff2 > 0 > diff1:
+            # then may be able to do #1 + #2
+            iinput = ''
+            num_iinput_tokens = 0
+            chat_index_final = len(history)
+            for chat_index in range(len(history)):
+                # NOTE: history and chat_conversation are older for first entries
+                # FIXME: This is slow for many short conversations
+                context2 = history_to_context_func(history[chat_index:])
+                num_context2_tokens = get_token_count(context2, tokenizer)
+                diff1 = non_doc_max_length - (
+                        num_instruction_tokens + num_context1_tokens + num_context2_tokens + min_max_new_tokens)
+                if diff1 > 0:
+                    chat_index_final = chat_index
+                    if verbose:
+                        print("chat_conversation used %d out of %d" % (chat_index, len(history)), flush=True)
+                    break
+            chat_index = chat_index_final  # i.e. if chat_index == len(history), then nothing can be consumed
+        elif diff3 > 0 > diff2:
+            # then may be able to do #1 + #2 + #3
+            iinput = ''
+            num_iinput_tokens = 0
+            context2 = ''
+            num_context2_tokens = 0
+            context1, num_context1_tokens = H2OTextGenerationPipeline.limit_prompt(context1, tokenizer,
+                                                                                   max_prompt_length=diff3)
+            if num_context1_tokens > diff3:
+                print("failed to reduce context1 to fit token budget", flush=True)
+        else:
+            # then must be able to do #1 + #2 + #3 + #4
+            iinput = ''
+            num_iinput_tokens = 0
+            context2 = ''
+            num_context2_tokens = 0
+            context1 = ''
+            num_context1_tokens = 0
+            # diff4 accounts for real prompting for instruction
+            # FIXME: history_to_context could include instruction, in case system prompt long, we overcount and could have more free tokens
+            instruction, num_instruction_tokens = H2OTextGenerationPipeline.limit_prompt(instruction, tokenizer,
+                                                                                         max_prompt_length=diff4)
+            # get actual tokens
+            data_point_just_instruction = dict(context='', instruction=instruction, input='')
+            prompt_just_instruction = prompter.generate_prompt(data_point_just_instruction)
+            num_instruction_tokens = get_token_count(prompt_just_instruction, tokenizer)
+
+    # update full context
+    context = context1 + context2
+    # update token counts (docs + non-docs, all tokens)
+    num_prompt_tokens = (num_instruction_tokens or 0) + \
+                        (num_context1_tokens or 0) + \
+                        (num_context2_tokens or 0) + \
+                        (num_iinput_tokens or 0) + \
+                        (num_doc_tokens or 0)
+
+    # update max_new_tokens
+    if inference_server and inference_server.startswith('http'):
+        # assume a TGI/Gradio server handles token limits and can produce long output, even if the request exceeds local model capacity
+        pass
+    else:
+        # limit max_new_tokens so that prompt + new < model max
+        # otherwise model can fail, e.g. distilgpt2 fails when asked for 1024 new tokens even with a 1-token prompt
+        max_new_tokens = min(max_new_tokens, model_max_length - num_prompt_tokens)
+
+    data_point = dict(context=context, instruction=instruction, input=iinput)
+    # handle promptA/promptB addition if context really came from history;
+    # if not from history, reduced=False inside generate_prompt is correct;
+    # if mixed, there is no single correct choice, so treat like history: promptA/B will still come first
+    context_from_history = len(history) > 0 and len(context1) > 0
+    prompt = prompter.generate_prompt(data_point, context_from_history=context_from_history)
+    num_prompt_tokens_actual = get_token_count(prompt, tokenizer)
+
+    return prompt, \
+        instruction, iinput, context, \
+        num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \
+        chat_index, top_k_docs, one_doc_size
+
+
+def get_docs_tokens(tokenizer, text_context_list=None, max_input_tokens=None):
+    if text_context_list is None or len(text_context_list) == 0:
+        return 0, None, 0
+    if max_input_tokens is None:
+        max_input_tokens = tokenizer.model_max_length
+    tokens = [get_token_count(x + '\n\n', tokenizer) for x in text_context_list]
+    tokens_cumsum = np.cumsum(tokens)
+    where_res = np.where(tokens_cumsum < max_input_tokens)[0]
+    # if no document fits within max_input_tokens, fall through to special handling below
+    if where_res.shape[0] > 0:
+        top_k_docs = 1 + where_res[-1]
+        one_doc_size = None
+        num_doc_tokens = tokens_cumsum[top_k_docs - 1]  # by index
+    else:
+        # if here, no doc fit at all; do best with just 1 (truncated) doc
+        top_k_docs = 1
+        text_context_list = text_context_list[:top_k_docs]
+        # critical protection
+        from src.h2oai_pipeline import H2OTextGenerationPipeline
+        doc_content = text_context_list[0]
+        doc_content, new_tokens0 = H2OTextGenerationPipeline.limit_prompt(doc_content,
+                                                                          tokenizer,
+                                                                          max_prompt_length=max_input_tokens)
+        text_context_list[0] = doc_content
+        one_doc_size = len(doc_content)
+        num_doc_tokens = get_token_count(doc_content + '\n\n', tokenizer)
+        print("Unexpected large chunks and can't add to context, will add 1 anyways.  Tokens %s -> %s" % (
+            tokens[0], new_tokens0), flush=True)
+    return top_k_docs, one_doc_size, num_doc_tokens
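+# Usage sketch (FakeTokenizer from utils, assumed here to be a rough token counter):
+#   >>> tok = FakeTokenizer()
+#   >>> top_k_docs, one_doc_size, num_doc_tokens = get_docs_tokens(
+#   ...     tok, text_context_list=['doc one', 'doc two'], max_input_tokens=1000)
+#   >>> top_k_docs  # both short docs fit
+#   2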
+
+
+def entrypoint_main():
+    """
+    Examples:
+
+    WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
+    python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
+    python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
+
+    # generate without lora weights, no prompt
+    python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain'
+    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq'
+
+    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq'
+    # OpenChatKit settings:
+    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0
+
+    python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False
+    python generate.py --base_model='t5-large' --prompt_type='simple_instruct'
+    python generate.py --base_model='philschmid/bart-large-cnn-samsum'
+    python generate.py --base_model='philschmid/flan-t5-base-samsum'
+    python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt'
+
+    python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
+
+    must have 4*48GB GPU and run without 8bit in order for sharding to work with use_gpu_id=False
+    can also pass --prompt_type='human_bot' and model can somewhat handle instructions without being instruct tuned
+    python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --use_gpu_id=False --prompt_type='human_bot'
+
+    python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b
+    """
+    H2O_Fire(main)
+
+
+if __name__ == "__main__":
+    entrypoint_main()