aiben / src /prepare_offline.py
abugaber's picture
Upload folder using huggingface_hub
3943768 verified
def noop_load(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing, and return ``None``."""
    return
def go_prepare_offline(*args, **kwargs):
    """Run document ingestion and tokenizer loads once so their assets get cached.

    Every value is read from ``kwargs``; positional ``args`` are ignored.

    Required keys in ``kwargs``:
        kwargs: nested dict providing ``langchain_mode``, ``chunk`` and
            ``chunk_size``.
        my_db_state0, selection_docs_state0, requests_state0,
        image_audio_loaders_options, pdf_loaders_options,
        url_loaders_options, jq_schema0, extract_frames, llava_prompt:
            forwarded positionally to ``update_db_func``.
        update_db_func: callable invoked as
            ``update_db_func(file, *inputs2)`` twice per sample file.

    Side effects:
        * ``'CaptionLarge'`` and ``'ASRLarge'`` are removed *in place* from
          ``kwargs['image_audio_loaders_options']`` (the caller's list).
        * ``tiktoken`` encodings and the ``gpt2`` tokenizer are loaded.

    Raises:
        KeyError: if a required key is missing.
        AssertionError: if a loaded encoding/tokenizer is falsy.
    """
    kwargs0 = kwargs['kwargs']
    # gen.py steps should have already obtained:
    # model+tokenizers from base_model or model_lock if required
    # tokenizers, including tokenizers for model_lock if using inference servers even if no LLM locally
    # score_model or reward model
    #
    # Additional steps are related to document Q/A:
    # For simplicity use gradio functions,
    # but not API calls that would require actual gradio app up and API usage that might have issues

    kwargs['max_quality'] = True
    embed = True
    h2ogpt_key = ''
    file_list = ['tests/driverslicense.jpeg', 'tests/CityofTshwaneWater.pdf', 'tests/example.xlsx']

    # Positional arguments that follow the file name in update_db_func.
    inputs2 = [kwargs['my_db_state0'],
               kwargs['selection_docs_state0'],
               kwargs['requests_state0'],
               kwargs0['langchain_mode'],
               kwargs0['chunk'],
               kwargs0['chunk_size'],
               embed,
               kwargs['image_audio_loaders_options'],
               kwargs['pdf_loaders_options'],
               kwargs['url_loaders_options'],
               kwargs['jq_schema0'],
               kwargs['extract_frames'],
               kwargs['llava_prompt'],
               h2ogpt_key,
               ]
    # Index of the image/audio loader options *within inputs2*.
    # (It is 8 only in the combined [file] + inputs2 list; using 8 here
    # would clobber pdf_loaders_options.)
    image_audio_slot = 7

    for fileup_output in file_list:
        # ensure normal blip (not 2) and normal asr (not asrlarge) obtained
        for large_variant in ('CaptionLarge', 'ASRLarge'):
            if large_variant in kwargs['image_audio_loaders_options']:
                kwargs['image_audio_loaders_options'].remove(large_variant)

        update_db_func = kwargs['update_db_func']
        update_db_func(fileup_output, *inputs2)

        # Re-sync the image/audio slot with the current options, then run again.
        inputs2[image_audio_slot] = kwargs['image_audio_loaders_options']
        update_db_func(fileup_output, *inputs2)

    # FakeTokenizer etc. needs tiktoken for general tasks
    import tiktoken
    encoding = tiktoken.get_encoding("cl100k_base")
    assert encoding
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    assert encoding

    # sometimes summarization needs gpt2 still
    from transformers import AutoTokenizer
    model_name = 'gpt2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    assert tokenizer

    # then run h2ogpt as:
    # HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False ...