ubermenchh committed on
Commit
f610abe
1 Parent(s): 6a75c94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -18
app.py CHANGED
@@ -5,6 +5,7 @@ from torch import bfloat16
5
  from langchain.llms import HuggingFacePipeline
6
  from langchain.vectorstores import Pinecone
7
  from langchain.chains import RetrievalQA
 
8
 
9
  embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
10
  device = 'cpu'
@@ -59,27 +60,47 @@ for i in range(0, len(data), batch_size):
59
  ]
60
  index.upsert(vectors=zip(ids, embeds, metadata))
61
 
62
- model_id = 'Trelis/Llama-2-7b-chat-hf-sharded-bf16'
 
 
 
63
  hf_auth = os.environ.get('HF_AUTH_KEY')
64
 
65
- bnb_config = transformers.BitsAndBytesConfig(
66
- load_in_4bit=True,
67
- bnb_4bit_quant_type='nf4',
68
- bnb_4bit_use_double_quant=True,
69
- bnb_4bit_compute_dtype=bfloat16,
70
- )
71
- model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
72
- model = transformers.AutoModelForCausalLM.from_pretrained(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  model_id,
74
- trust_remote_code=True,
75
- config=model_config,
76
- quantization_config=bnb_config,
77
- device_map='auto',
78
- use_auth_token=hf_auth
79
  )
80
- model.eval()
81
-
82
- tokenizer = transformer.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
83
 
84
  generate_text = transformers.pipeline(
85
  model=model,
@@ -102,7 +123,7 @@ rag_pipeline = RetrievalQA.from_chain_type(
102
  title = 'arxiv-retrieval'
103
 
104
  def predict(input):
105
- return rag_pipeline(input)
106
 
107
  gr.Interface(
108
  fn=predict,
 
5
  from langchain.llms import HuggingFacePipeline
6
  from langchain.vectorstores import Pinecone
7
  from langchain.chains import RetrievalQA
8
+ import ctransformers
9
 
10
  embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
11
  device = 'cpu'
 
60
  ]
61
  index.upsert(vectors=zip(ids, embeds, metadata))
62
 
63
+ #model_id = "TheBloke/Llama-2-7B-GGML"
64
+ #model_id = "TheBloke/Llama-2-7B-chat-GGML"
65
+ #model_id = "TheBloke/Llama-2-13B-GGML"
66
+ model_id = "TheBloke/Llama-2-13B-chat-GGML"
67
  hf_auth = os.environ.get('HF_AUTH_KEY')
68
 
69
+ # bnb_config = transformers.BitsAndBytesConfig(
70
+ # load_in_4bit=True,
71
+ # bnb_4bit_quant_type='nf4',
72
+ # bnb_4bit_use_double_quant=True,
73
+ # bnb_4bit_compute_dtype=bfloat16,
74
+ # )
75
+ # model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
76
+ # model = transformers.AutoModelForCausalLM.from_pretrained(
77
+ # model_id,
78
+ # trust_remote_code=True,
79
+ # config=model_config,
80
+ # quantization_config=bnb_config,
81
+ # device_map='auto',
82
+ # use_auth_token=hf_auth
83
+ # )
84
+ # model.eval()
85
+
86
+ # tokenizer = transformer.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
87
+
88
+ ## Using GGML Llama
89
+
90
+ config = {
91
+ 'max_new_tokens': 512,
92
+ 'repetition_penalty': 1.1,
93
+ 'temperature': 0.3,
94
+ 'stream': True
95
+ }
96
+ model = ctransformers.AutoModelForCausalLM.from_pretrained(
97
  model_id,
98
+ model_type='llama',
99
+ gpu_layers=130, # 110 for 7b, 130 for 13b
100
+ hf=True,
101
+ **config
 
102
  )
103
+ tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)
 
 
104
 
105
  generate_text = transformers.pipeline(
106
  model=model,
 
123
  title = 'arxiv-retrieval'
124
 
125
  def predict(input):
126
+ return rag_pipeline(input)['result']
127
 
128
  gr.Interface(
129
  fn=predict,