new files
- llm.py +261 -0
- text_utils.py +79 -0
- utils.py +44 -0
llm.py
ADDED
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
import utils; from utils import *
import os, sys, lzma, json, pprint, time, subprocess

thinker = os.getenv("thinker", "gemini")
TEMPERATURE = float(os.getenv("temperature", 0.1)) # 0.0 conservative (good for coding and correct syntax)

LLM_HOST = "gemini"
TKNZ_RATIO = 1

GEMINI_MODEL = 'gemini-1.5-pro-002'
FLASH_MODEL = 'gemini-1.5-flash-002'

# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Prompting.ipynb
# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Streaming.ipynb
import google.generativeai as genai # pip install -U -q google-generativeai
llm_log_filename = f"{location__}/data/llm.log"


genai.configure(api_key=os.getenv("GEMINI_FLASH_API_KEY"))

GEMINI_CLIENT = genai.GenerativeModel(GEMINI_MODEL, \
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*4,
        temperature=TEMPERATURE
    ))

def chat(prompt, history=[], use_cache=False, stream=False):
    if stream: return GEMINI_CLIENT.generate_content(prompt, stream=True)

    messages = history + [{"role": "user", "content": prompt}] # fake history
    with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {GEMINI_MODEL} ] - - -\n\nPROMPT:\n{prompt}\n")

    try:
        res = GEMINI_CLIENT.generate_content(prompt, request_options = { "timeout": 6000 })
        with open(llm_log_filename,"at") as f: f.write(f"\nRESPONSE:\n{res}\n"); f.write(f"\nCONTENT:\n{res.text}\n")
        messages += [{"role": "assistant", "content": res.text}]
        return messages

    except Exception as e:
        with open(llm_log_filename,"at") as f: f.write(f"\nEXCEPTION:\n{e}\n")
        print(f"\nEXCEPTION:\n{e}\n"); raise e


FLASH_CLIENT = genai.GenerativeModel(FLASH_MODEL, \
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*8,
        temperature=TEMPERATURE
    ))

# def flash_chat(prompt, history=[], use_cache=False, stream=False):
#     res = FLASH_CLIENT.generate_content(prompt)
#     return [{"role": "assistant", "content": res.text}]
flash_chat = chat

def who_are_you():
    print(f"{RED}{LLM_HOST}{RESET} " * 2)


if thinker == "gemini": # gemini pro
    CTXLEN = 1024*64 # gemini is easygoing here, 128k or even 1m ctxlen is fine
    thinker_chat = chat

elif thinker in "70b|405b":
    cache_filename = f"{location__}/data/thinker.jsonl.xz"
    lock_filename = f"{location__}/data/thinker.lock"
    log_filename = f"{location__}/data/thinker.log"

    ## Load thinker_cache
    lines = [] if not os.path.exists(cache_filename) else \
        [ line for line in lzma.open(cache_filename,"rt") ]
    assert len(lines) % 2 == 0
    thinker_cache = {}; i = 0
    while i < len(lines): # each line ends with \n, use [:-1] to drop it
        thinker_cache[lines[i][:-1]] = json.loads(lines[i+1])
        i += 2
    lines = None # Done loading

    # https://docs.together.ai/docs/chat-models#hosted-models
    model = {
        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 8k 3k 1.2", # $5.00 / 1m tokens(*)
        "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k 4k 1.2", # $0.88 / 1m tokens(*)
    }[thinker]

    model, CTXLEN, MAX_TOKENS, TKNZ_RATIO = model.strip().split()
    LLM_HOST = model

    MAX_TOKENS = int(MAX_TOKENS[:-1])*1024
    TKNZ_RATIO = float(TKNZ_RATIO)

    CTXLEN = int(CTXLEN[:-1])
    if CTXLEN > 32: CTXLEN = 32 # max 32k ctxlen
    CTXLEN = CTXLEN*1024 - MAX_TOKENS
    # print(model, CTXLEN, MAX_TOKENS, TKNZ_RATIO); input(); # DEBUG

    from together import Together
    together_client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))
    ###
    stops = ["<|eot_id|>","<|eom_id|>","</answer>","</output>"]
    def thinker_chat(prompt, history=[], stream=False, use_cache=True, testing=False):
        if stream:
            with open(log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return together_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                top_p=0.7, top_k=50,
                repetition_penalty=1.2, stop=stops,
                stream=True
            )

        messages = history + [{"role": "user", "content": prompt}]
        messages_jsonl = json.dumps(messages, ensure_ascii=False)
        cache_found = (messages_jsonl in thinker_cache)

        if use_cache and cache_found:
            print(f"{YELLOW}<<< cached content >>>{RESET}")
            content = thinker_cache[messages_jsonl]

        elif testing:
            print(f"{RED}<<< testing content >>>{RESET}")
            content = "testing testing"

        else:
            print(f"{GREEN}<<< fresh content >>>{RESET}")
            with open(log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            try:
                response = Together(api_key=os.environ.get('TOGETHER_API_KEY')).chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    top_p=0.7, top_k=50,
                    repetition_penalty=1.2, stop=stops,
                    logprobs=1, stream=False
                )
            except Exception as e:
                with open(log_filename,"at") as f: f.write(f"\nEXCEPTION:\n{e}\n")
                print(f"\nEXCEPTION:\n{e}\n"); raise e

            content = response.choices[0].message.content
            with open(log_filename,"at") as f:
                f.write(f"\nRESPONSE:\n{response}\n")
                f.write(f"\nCONTENT:\n{content}\n")

            thinker_cache[messages_jsonl] = content # update new generated content

            waits = 5
            while waits > 0 and os.path.exists(lock_filename): # someone else is writing, wait
                waits -= 1
                time.sleep(0.2)

            if waits == 0:
                assert False, f"Locked for more than 1 second; you can delete {lock_filename} if this error keeps happening"

            subprocess.run(f"touch {lock_filename}", shell=True) # lock
            with lzma.open(cache_filename,"at") as f: # write
                f.write(f"{messages_jsonl}\n{json.dumps(content, ensure_ascii=False)}\n")
            subprocess.run(f"rm {lock_filename}", shell=True) # unlock

        messages += [{"role": "assistant", "content": content}]
        return messages


elif thinker in "gemma2:27b|commandr:35b|llama3.1:70b":
    #################
    ## Ollama connect
    import subprocess, ollama # pip install ollama
    try: ollama.list()
    except: subprocess.run('nohup ssh -N -L 11434:localhost:11434 -p 22021 [email protected] &', shell=True)
    subprocess.run('nohup ssh -N -L 9999:localhost:11434 -p 17340 [email protected] &', shell=True)
    #################
    OLLAMA_CLIENT = ollama.Client(host='http://localhost:11434')
    machine = "RTX-4090-24G"

    ## ~30b models
    if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q5_K_M" ; CTXLEN = 512*14 # fit 24G
    elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q4_K_M" ; CTXLEN = 512*18 # fit 24G
    else: OLLAMA_MODEL = "not found"

    try: connect_to_4090 = OLLAMA_MODEL in str(ollama.list())
    except: connect_to_4090 = False

    if not connect_to_4090: # switch to A100
        OLLAMA_CLIENT = ollama.Client(host='http://localhost:9999')
        machine = "A100-PCIE-40GB"
        ## ~30b to ~70b models
        if thinker in "gemma2:27b": OLLAMA_MODEL = "gemma2:27b-instruct-q8_0" ; CTXLEN = 1024*24
        elif thinker in "commandr:35b": OLLAMA_MODEL = "command-r:35b-08-2024-q8_0" ; CTXLEN = 1024*32
        elif thinker in "llama3.1:70b": OLLAMA_MODEL = "llama3.1:70b-instruct-q3_K_M" ; CTXLEN = 1024*12 # fit 40G
    LLM_HOST = f"{machine}__{OLLAMA_MODEL}"

    def thinker_chat(prompt, history=[], stream=False, use_cache=False):
        if stream:
            with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=[{"role": "user", "content": prompt}], \
                stream=True, options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE})

        messages = history + [{"role": "user", "content": prompt}]
        with open(llm_log_filename,"at") as f: f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
        res = OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=messages, options={'temperature': TEMPERATURE})
        content = res["message"]["content"]
        with open(llm_log_filename,"at") as f: f.write(f"\nCONTENT:\n{content}\n")
        messages += [{"role": "assistant", "content": content}]
        return messages

    ## To make it a 100% local LLM, normal chat can also use thinker
    # chat = thinker_chat

LLM_HOST += f"__{round(CTXLEN/1024)}k_ctxlen"
who_are_you()


from prompts import summary_template
from prompts import contextual_template, clean_view_template

USE_CACHE = os.getenv("cache", "1") == "1"


def extract_keyphrases_figures_summary(text):
    if len(text) < 80: return ""

    prompt = summary_template.format(text = text)
    print(f"{GREEN}{text}{RESET}")

    utils.reset_timer(timer = "extract_keyphrases_figures_summary")
    res = chat(prompt, use_cache = USE_CACHE)
    utils.measure_time("", timer = "extract_keyphrases_figures_summary")

    raw = res[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")

    return raw


def gen_contextual(document, chunk):
    prompt = contextual_template.format(document = document, chunk = chunk)
    res = thinker_chat(prompt, use_cache = USE_CACHE)
    contextual = res[-1]["content"].strip()
    return contextual


def gen_clean_view(document):
    prompt = clean_view_template.format(document = document)
    res = chat(prompt, use_cache = USE_CACHE)
    ret = res[-1]["content"].strip()
    return ret


if __name__ == "__main__":

    try: filename = sys.argv[1]
    except: filename = None
    if filename: q = open(filename, "rt").read()
    else: q = "What's your name? Who created you?"

    utils.reset_timer(); res = thinker_chat(q, use_cache=False)
    utils.measure_time(LLM_HOST + " ")
    print(f"{CYAN}{q}{RESET}", end="\n\n"); print(res[-1]["content"])
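A minimal usage sketch for the entry points above (hypothetical prompts; it assumes GEMINI_FLASH_API_KEY is set, or TOGETHER_API_KEY when thinker is 70b|405b, and that the data/ directory next to the module exists for the log files):

# Hypothetical usage sketch, not part of the commit.
# chat() and thinker_chat() both take a prompt plus an optional message history
# and return that history with the new user/assistant turns appended; note that
# only the latest prompt is actually sent upstream (the history is "fake").
history = chat("Summarize the attention mechanism in one sentence.")
history = chat("Now translate that sentence to Vietnamese.", history=history)
print(history[-1]["content"])

# With stream=True the raw backend iterator is returned instead of a message list.
for part in chat("Count from 1 to 5.", stream=True):
    print(part.text, end="", flush=True)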
text_utils.py
ADDED
@@ -0,0 +1,79 @@
import re, os, sys
from utils import *

def get_paragraphs(text, cutoff=10):
    return [ x.strip() for x in re.split(r'\n+', text, flags=re.MULTILINE) if len(x.strip()) > cutoff ]

def get_para_sentences(text, cutoff=60):
    para_sents = []
    for para in get_paragraphs(text):
        sents = []; sent = ""
        chunks = re.split(r'\.+', para); n = len(chunks)
        for i in range(0, n):
            sent += chunks[i]
            if i < n - 1: sent += "."
            if len(sent) > cutoff:
                sents.append(sent)
                sent = ""
        if len(sent) > 0: sents.append(sent)
        # print(sents); input()
        para_sents.append(sents)
    return para_sents

def get_idx_from_marked_chunk(marked_chunk):
    return int(re.match(r'<C\s*(\d+)>', marked_chunk)[1])
import random; idx = random.randint(0, 99999)
assert get_idx_from_marked_chunk(f"<C {idx}> ha ha") == idx


def add_chunk_markers(text, lookup_idx = None, para = True):
    if para: para_chunks = get_paragraphs(text)
    else: para_chunks = get_para_sentences(text)

    marked_text = ""; chunk_idx = 0
    for chunks in para_chunks:
        if isinstance(chunks, str): chunks = [chunks]
        for idx, chunk in enumerate(chunks):
            marked_chunk = f"<C {chunk_idx}>{chunk.strip()}"

            chunks[idx] = marked_chunk
            if lookup_idx == chunk_idx: print(marked_chunk); sys.exit() # assert False, f"Found {lookup_idx}"

            marked_text += f"{marked_chunk}\n"
            chunk_idx += 1
        marked_text += "\n"
    return marked_text.strip(), para_chunks

# digits, ASCII letters and Vietnamese letters (lowercase only; matching below is case-insensitive)
alphabet = '[0-9a-zaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵ]'
word = re.compile(f'{alphabet}+', re.IGNORECASE)
###
def hilite(query, source, hilite_color=YELLOW, source_color=GREY, query_color=None):
    for keyword in set(re.findall(word, query)):
        keyword = re.escape(keyword)
        re_keyword = re.compile(rf"(\b{keyword}\b)", flags=re.IGNORECASE | re.MULTILINE)
        if re_keyword.search(source):
            source = re.sub(re_keyword, rf'{hilite_color}\1{source_color}', source)
            if query_color is not None:
                query = re.sub(re_keyword, rf'{hilite_color}\1{query_color}', query)
    return source, query


def pretty_num(x):
    return round(x*100)/100

def count_words(x):
    assert isinstance(x, str), f"input is not a string: {x}"
    return len(x.split())

def extract_(text, tag):
    raw = text.split(f"</{tag}>")[0].split(f"<{tag}>")[-1]
    if tag == "summary": return raw.strip()
    splits = re.split(r'[\n,]+', raw)
    splits = [ re.sub(r'^\s*-\s*', '', s).strip() for s in splits ]
    splits = [ s for s in splits if len(s) > 0 ]
    return splits

def extract_xmls(text, tags):
    if text is None: return None
    return { tag: extract_(text, tag) for tag in tags }
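A small illustrative sketch (the strings are made up) of how the chunk markers and the tag extraction above fit together:

# Hypothetical usage sketch, not part of the commit.
doc = "Paragraph one is about cats.\n\nParagraph two is about dogs and cats."
marked, para_chunks = add_chunk_markers(doc)   # para=True: one <C i> marker per paragraph
print(marked)                                  # "<C 0>Paragraph one ...\n\n<C 1>Paragraph two ..."
print(get_idx_from_marked_chunk("<C 1>Paragraph two is about dogs and cats."))  # -> 1

# extract_xmls() pulls tagged fields out of an LLM answer: "summary" stays a single
# string, every other tag is split on commas/newlines with leading "- " bullets stripped.
answer = "<keyphrases>cats, dogs</keyphrases>\n<summary>Two short paragraphs.</summary>"
print(extract_xmls(answer, ["keyphrases", "summary"]))
# {'keyphrases': ['cats', 'dogs'], 'summary': 'Two short paragraphs.'}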
utils.py
ADDED
@@ -0,0 +1,44 @@
import time, os
location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

## Commonly used colors
BLACK = '\033[30m'; WHITE = '\033[97m'
RED = '\033[91m'; YELLOW = '\033[33m'
GREEN = '\033[32m'; CYAN = '\033[36m'
BLUE = '\033[94m'; GREY = '\033[37m'
MAGENTA = '\033[95m'; RESET = '\033[0m'

# Dark colors     # Bright colors   # Dark backgrounds  # Bright backgrounds
DK = '\033[30m'; BK = '\033[90m'; GDK = '\033[40m'; GBK = '\033[100m'; # BLACK
DR = '\033[31m'; BR = '\033[91m'; GDR = '\033[41m'; GBR = '\033[101m'; # RED
DG = '\033[32m'; BG = '\033[92m'; GDG = '\033[42m'; GBG = '\033[102m'; # GREEN
DY = '\033[33m'; BY = '\033[93m'; GDY = '\033[43m'; GBY = '\033[103m'  # YELLOW
DB = '\033[34m'; BB = '\033[94m'; GDB = '\033[44m'; GBB = '\033[104m'; # BLUE
DM = '\033[35m'; BM = '\033[95m'; GDM = '\033[45m'; GBM = '\033[105m'; # MAGENTA (pinkish purple)
DC = '\033[36m'; BC = '\033[96m'; GDC = '\033[46m'; GBC = '\033[106m'; # CYAN
DW = '\033[37m'; BW = '\033[97m'; GDW = '\033[47m'; GBW = '\033[107m'; # WHITE

def pretty_num(x): return round(x*100)/100

TIMER_STARTED_AT = { "default": time.time() }
def reset_timer(timer="default"):
    global TIMER_STARTED_AT
    TIMER_STARTED_AT[timer] = time.time()

def measure_time(message="", timer="default", color=YELLOW):
    total = time.time() - TIMER_STARTED_AT[timer]
    total = round(total * 100) / 100

    message = message.strip()
    if len(message) > 0:
        message = " " + message

    print(f"{color}{timer}:{message} {total} seconds{RESET}")

count_words = lambda x: len(x.split())

if __name__ == "__main__":
    reset_timer(timer="my timer")
    s = "chào cả nhà, cả nhà khỏe không ạ?"
    print(f"{RED}{s}{RESET} has {CYAN}{count_words(s)} words")
    measure_time("total run time", timer="my timer")