Spaces:

ubermenchh
/

arxiv-retrieval

Build error

App Files Community

ubermenchh committed on Oct 26, 2023

Commit

f610abe

1 Parent(s): 6a75c94

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -18

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from torch import bfloat16
 from langchain.llms import HuggingFacePipeline
 from langchain.vectorstores import Pinecone
 from langchain.chains import RetrievalQA
 embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
 device = 'cpu'
@@ -59,27 +60,47 @@ for i in range(0, len(data), batch_size):
     ]
     index.upsert(vectors=zip(ids, embeds, metadata))
-model_id = 'Trelis/Llama-2-7b-chat-hf-sharded-bf16'
 hf_auth = os.environ.get('HF_AUTH_KEY')
-bnb_config = transformers.BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type='nf4',
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=bfloat16,
-)
-model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
-model = transformers.AutoModelForCausalLM.from_pretrained(
     model_id,
-    trust_remote_code=True,
-    config=model_config,
-    quantization_config=bnb_config,
-    device_map='auto',
-    use_auth_token=hf_auth
 )
-model.eval()
-tokenizer = transformer.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
 generate_text = transformers.pipeline(
     model=model,
@@ -102,7 +123,7 @@ rag_pipeline = RetrievalQA.from_chain_type(
 title = 'arxiv-retrieval'
 def predict(input):
-    return rag_pipeline(input)
 gr.Interface(
     fn=predict,

 from langchain.llms import HuggingFacePipeline
 from langchain.vectorstores import Pinecone
 from langchain.chains import RetrievalQA
+import ctransformers
 embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
 device = 'cpu'
     ]
     index.upsert(vectors=zip(ids, embeds, metadata))
+#model_id = "TheBloke/Llama-2-7B-GGML"
+#model_id = "TheBloke/Llama-2-7B-chat-GGML"
+#model_id = "TheBloke/Llama-2-13B-GGML"
+model_id = "TheBloke/Llama-2-13B-chat-GGML"
 hf_auth = os.environ.get('HF_AUTH_KEY')
+# bnb_config = transformers.BitsAndBytesConfig(
+#     load_in_4bit=True,
+#     bnb_4bit_quant_type='nf4',
+#     bnb_4bit_use_double_quant=True,
+#     bnb_4bit_compute_dtype=bfloat16,
+# )
+# model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
+# model = transformers.AutoModelForCausalLM.from_pretrained(
+#     model_id,
+#     trust_remote_code=True,
+#     config=model_config,
+#     quantization_config=bnb_config,
+#     device_map='auto',
+#     use_auth_token=hf_auth
+# )
+# model.eval()
+# tokenizer = transformer.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
+## Using GGML Llama
+config = {
+    'max_new_tokens': 512,
+    'repetition_penalty': 1.1,
+    'temperature': 0.3,
+    'stream': True
+}
+model = ctransformers.AutoModelForCausalLM.from_pretrained(
     model_id,
+    model_type='llama',
+    gpu_layers=130, # 110 for 7b, 130 for 13b
+    hf=True,
+    **config
 )
+tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)
 generate_text = transformers.pipeline(
     model=model,
 title = 'arxiv-retrieval'
 def predict(input):
+    return rag_pipeline(input)['result']
 gr.Interface(
     fn=predict,