import utils
from utils import *

import os, sys, lzma, json, pprint, time, subprocess
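
# Runtime configuration via environment variables:
#   thinker     -- which backend answers thinker_chat() calls:
#                  "gemini" (default), "70b", "405b", "gemma2:27b", "commandr:35b", "llama3.1:70b"
#   temperature -- sampling temperature shared by every backend
#   cache       -- "1" (default) lets the helper functions reuse cached answers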
thinker = os.getenv("thinker", "gemini")
TEMPERATURE = float(os.getenv("temperature", 0.1))

LLM_HOST = "gemini"
TKNZ_RATIO = 1

GEMINI_MODEL = 'gemini-1.5-pro-002'
FLASH_MODEL = 'gemini-1.5-flash-002'
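
# --- Gemini backend (google.generativeai) ---
# chat() always talks to GEMINI_CLIENT; every prompt and response is appended to llm.log.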
import google.generativeai as genai

llm_log_filename = f"{location__}/data/llm.log"

genai.configure(api_key=os.getenv("GEMINI_FLASH_API_KEY"))

GEMINI_CLIENT = genai.GenerativeModel(
    GEMINI_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*4,
        temperature=TEMPERATURE,
    ),
)
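
# chat(): one-shot call to the Gemini pro model.
# `history` is only used to build the returned message list; it is not sent to the model.
# `use_cache` exists for signature parity with thinker_chat() and is ignored here.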
def chat(prompt, history=[], use_cache=False, stream=False):
    if stream: return GEMINI_CLIENT.generate_content(prompt, stream=True)

    messages = history + [{"role": "user", "content": prompt}]
    with open(llm_log_filename, "at") as f: f.write(f"\n- - - [ {GEMINI_MODEL} ] - - -\n\nPROMPT:\n{prompt}\n")

    try:
        res = GEMINI_CLIENT.generate_content(prompt, request_options={"timeout": 6000})
        with open(llm_log_filename, "at") as f:
            f.write(f"\nRESPONSE:\n{res}\n")
            f.write(f"\nCONTENT:\n{res.text}\n")
        messages += [{"role": "assistant", "content": res.text}]
        return messages

    except Exception as e:
        with open(llm_log_filename, "at") as f: f.write(f"\nEXCEPTION:\n{e}\n")
        print(f"\nEXCEPTION:\n{e}\n"); raise e

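
# A separate Flash client is configured here, but flash_chat is currently just an
# alias for chat(), so flash requests still go through GEMINI_CLIENT (the pro model).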
FLASH_CLIENT = genai.GenerativeModel(
    FLASH_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*8,
        temperature=TEMPERATURE,
    ),
)

flash_chat = chat

def who_are_you():
    print(f"{RED}{LLM_HOST}{RESET} " * 2)

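
# Bind thinker_chat() to the selected backend:
#   - chat()                    (Gemini, the default)
#   - a Together.ai Llama call  (thinker = "70b" or "405b"), with an xz-compressed on-disk cache
#   - an Ollama call            (gemma2 / command-r / llama3.1 behind SSH tunnels)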
if thinker == "gemini":
    CTXLEN = 1024*64
    thinker_chat = chat

elif thinker in ("70b", "405b"):
    cache_filename = f"{location__}/data/thinker.jsonl.xz"
    lock_filename = f"{location__}/data/thinker.lock"
    log_filename = f"{location__}/data/thinker.log"

    # Load the on-disk cache: the xz file holds pairs of lines (request JSON, response JSON).
    lines = [] if not os.path.exists(cache_filename) else \
        [line for line in lzma.open(cache_filename, "rt")]
    assert len(lines) % 2 == 0
    thinker_cache = {}; i = 0
    while i < len(lines):
        thinker_cache[lines[i][:-1]] = json.loads(lines[i+1])
        i += 2
    lines = None

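    # Each entry packs "<model id> <context window> <max output tokens> <tokenizer ratio>"
    # into a single string, unpacked right below.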
    model = {
        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 8k 3k 1.2",
        "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k 4k 1.2",
    }[thinker]

    model, CTXLEN, MAX_TOKENS, TKNZ_RATIO = model.strip().split()
    LLM_HOST = model

    MAX_TOKENS = int(MAX_TOKENS[:-1]) * 1024      # "3k" -> 3072
    TKNZ_RATIO = float(TKNZ_RATIO)

    CTXLEN = int(CTXLEN[:-1])                     # "128k" -> 128
    if CTXLEN > 32: CTXLEN = 32                   # cap the usable context at 32k
    CTXLEN = CTXLEN*1024 - MAX_TOKENS             # reserve room for the completion

    from together import Together
    together_client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

    stops = ["<|eot_id|>", "<|eom_id|>", "</answer>", "</output>"]
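
    # thinker_chat(): Together.ai chat completion with the on-disk cache above.
    # Streaming requests bypass the cache; cached answers are keyed by the
    # JSON-serialised message list.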
    def thinker_chat(prompt, history=[], stream=False, use_cache=True, testing=False):
        if stream:
            with open(log_filename, "at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return together_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                top_p=0.7, top_k=50,
                repetition_penalty=1.2, stop=stops,
                stream=True,
            )

        messages = history + [{"role": "user", "content": prompt}]
        messages_jsonl = json.dumps(messages, ensure_ascii=False)
        cache_found = (messages_jsonl in thinker_cache)

        if use_cache and cache_found:
            print(f"{YELLOW}<<< cached content >>>{RESET}")
            content = thinker_cache[messages_jsonl]

        elif testing:
            print(f"{RED}<<< testing content >>>{RESET}")
            content = "testing testing"

        else:
            print(f"{GREEN}<<< fresh content >>>{RESET}")
            with open(log_filename, "at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            try:
                response = Together(api_key=os.environ.get('TOGETHER_API_KEY')).chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    top_p=0.7, top_k=50,
                    repetition_penalty=1.2, stop=stops,
                    logprobs=1, stream=False,
                )
            except Exception as e:
                with open(log_filename, "at") as f: f.write(f"\nEXCEPTION:\n{e}\n")
                print(f"\nEXCEPTION:\n{e}\n"); raise e

            content = response.choices[0].message.content
            with open(log_filename, "at") as f:
                f.write(f"\nRESPONSE:\n{response}\n")
                f.write(f"\nCONTENT:\n{content}\n")

            thinker_cache[messages_jsonl] = content

            # Append the new (request, response) pair to the cache file, guarded by a
            # crude lock file so concurrent processes do not interleave writes.
            waits = 5
            while waits > 0 and os.path.exists(lock_filename):
                waits -= 1
                time.sleep(0.2)

            if waits == 0:
                assert False, f"Locked for more than 1 second; delete {lock_filename} if this error keeps recurring"

            subprocess.run(f"touch {lock_filename}", shell=True)
            with lzma.open(cache_filename, "at") as f:
                f.write(f"{messages_jsonl}\n{json.dumps(content, ensure_ascii=False)}\n")
            subprocess.run(f"rm {lock_filename}", shell=True)

        messages += [{"role": "assistant", "content": content}]
        return messages

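
# --- Ollama backend: models served behind SSH port-forwards ---
# Port 11434 is expected to reach an RTX-4090 box; port 9999 reaches an A100 box used as fallback.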
elif thinker in ("gemma2:27b", "commandr:35b", "llama3.1:70b"):

    import subprocess, ollama
    try: ollama.list()
    except: subprocess.run('nohup ssh -N -L 11434:localhost:11434 -p 22021 [email protected] &', shell=True)
    subprocess.run('nohup ssh -N -L 9999:localhost:11434 -p 17340 [email protected] &', shell=True)

    OLLAMA_CLIENT = ollama.Client(host='http://localhost:11434')
    machine = "RTX-4090-24G"

    if   thinker == "gemma2:27b":   OLLAMA_MODEL = "gemma2:27b-instruct-q5_K_M";   CTXLEN = 512*14
    elif thinker == "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q4_K_M"; CTXLEN = 512*18
    else:                           OLLAMA_MODEL = "not found"

    try: connect_to_4090 = OLLAMA_MODEL in str(ollama.list())
    except: connect_to_4090 = False

    # Fall back to the A100 tunnel (bigger quantisations and context windows) if the
    # 4090 does not serve the requested model.
    if not connect_to_4090:
        OLLAMA_CLIENT = ollama.Client(host='http://localhost:9999')
        machine = "A100-PCIE-40GB"

        if   thinker == "gemma2:27b":   OLLAMA_MODEL = "gemma2:27b-instruct-q8_0";     CTXLEN = 1024*24
        elif thinker == "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q8_0";   CTXLEN = 1024*32
        elif thinker == "llama3.1:70b": OLLAMA_MODEL = "llama3.1:70b-instruct-q3_K_M"; CTXLEN = 1024*12

    LLM_HOST = f"{machine}__{OLLAMA_MODEL}"
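
    # thinker_chat(): Ollama chat call. `use_cache` exists for signature parity only;
    # nothing is cached on this backend. Streaming returns the raw response iterator.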
    def thinker_chat(prompt, history=[], stream=False, use_cache=False):
        if stream:
            with open(llm_log_filename, "at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=[{"role": "user", "content": prompt}],
                                      stream=True, options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE})

        messages = history + [{"role": "user", "content": prompt}]
        with open(llm_log_filename, "at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
        res = OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=messages,
                                 options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE})
        content = res["message"]["content"]
        with open(llm_log_filename, "at") as f: f.write(f"\nCONTENT:\n{content}\n")
        messages += [{"role": "assistant", "content": content}]
        return messages


LLM_HOST += f"__{round(CTXLEN/1024)}k_ctxlen"
who_are_you()

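
# --- Prompt-template helpers built on top of the backends above ---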
from prompts import summary_template
from prompts import contextual_template, clean_view_template

USE_CACHE = os.getenv("cache", "1") == "1"

def extract_keyphrases_figures_summary(text):
    # Ignore very short inputs; they are not worth a model call.
    if len(text) < 80: return ""

    prompt = summary_template.format(text=text)
    print(f"{GREEN}{text}{RESET}")

    utils.reset_timer(timer="extract_keyphrases_figures_summary")
    res = chat(prompt, use_cache=USE_CACHE)
    utils.measure_time("", timer="extract_keyphrases_figures_summary")

    raw = res[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")

    return raw


def gen_contextual(document, chunk):
    # Build the contextual prompt from document + chunk and ask the thinker backend.
    prompt = contextual_template.format(document=document, chunk=chunk)
    res = thinker_chat(prompt, use_cache=USE_CACHE)
    contextual = res[-1]["content"].strip()
    return contextual


def gen_clean_view(document):
    # Ask the Gemini chat() backend for a cleaned-up view of the document.
    prompt = clean_view_template.format(document=document)
    res = chat(prompt, use_cache=USE_CACHE)
    ret = res[-1]["content"].strip()
    return ret

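
# Smoke test: running this module directly sends one prompt through thinker_chat().
# An optional first CLI argument names a file whose contents become the prompt;
# otherwise a fixed test question is used.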
if __name__ == "__main__":

    try: filename = sys.argv[1]
    except IndexError: filename = None
    if filename: q = open(filename, "rt").read()
    else: q = "What's your name? Who created you?"

    utils.reset_timer(); res = thinker_chat(q, use_cache=False)
    utils.measure_time(LLM_HOST + " ")
    print(f"{CYAN}{q}{RESET}", end="\n\n"); print(res[-1]["content"])