Darka001 committed on
Commit 431e86e · verified · 1 Parent(s): 6e8f6be

Update app.py

Files changed (1):
  1. app.py +109 -109
app.py CHANGED
@@ -2,134 +2,134 @@ import gradio as gr
 from gradio_calendar import Calendar
 
 
-# # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-# # from langchain_community.vectorstores import Chroma
-# # from langchain_core.output_parsers import StrOutputParser
-# import torch
-# from transformers import (
-#     AutoModelForCausalLM,
-#     AutoTokenizer,
-#     BitsAndBytesConfig,
-#     pipeline,
-#     StoppingCriteria, StoppingCriteriaList
-# )
-
-# from langchain.prompts import PromptTemplate
-# from langchain_community.llms import HuggingFacePipeline
-# from langchain.chains import LLMChain
-# from langchain_core.runnables import RunnablePassthrough, RunnableParallel
-
-
-# # instructor_embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large",
-# #                                                model_kwargs={"device": "cuda"})
-
-
-# model_name='SherlockAssistant/Mistral-7B-Instruct-Ukrainian'
-
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-# #tokenizer.pad_token = tokenizer.unk_token
-# #tokenizer.padding_side = "right"
-
-
-# # # Activate 4-bit precision base model loading
-# # use_4bit = True
-
-# # # Compute dtype for 4-bit base models
-# # bnb_4bit_compute_dtype = "float16"
-
-# # # Quantization type (fp4 or nf4)
-# # bnb_4bit_quant_type = "nf4"
-
-# # # Activate nested quantization for 4-bit base models (double quantization)
-# # use_nested_quant = False
-
-# # #################################################################
-# # # Set up quantization config
-# # #################################################################
-# # compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
-
-# # bnb_config = BitsAndBytesConfig(
-# #     load_in_4bit=use_4bit,
-# #     bnb_4bit_quant_type=bnb_4bit_quant_type,
-# #     bnb_4bit_compute_dtype=compute_dtype,
-# #     bnb_4bit_use_double_quant=use_nested_quant,
-# # )
-
-# # # Check GPU compatibility with bfloat16
-# # if compute_dtype == torch.float16 and use_4bit:
-# #     major, _ = torch.cuda.get_device_capability()
-# #     if major >= 8:
-# #         print("=" * 80)
-# #         print("Your GPU supports bfloat16: accelerate training with bf16=True")
-# #         print("=" * 80)
-
-
-# model = AutoModelForCausalLM.from_pretrained(
-#     model_name)
-# stop_list = [" \n\nAnswer:", " \n", " \n\n"]
-# stop_token_ids = [tokenizer(x, return_tensors='pt', add_special_tokens=False)['input_ids'] for x in stop_list]
-# stop_token_ids = [torch.LongTensor(x).to("cuda") for x in stop_token_ids]
-
-# class StopOnTokens(StoppingCriteria):
-#     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-#         for stop_ids in stop_token_ids:
-#             if torch.eq(input_ids[0][-len(stop_ids[0])+1:], stop_ids[0][1:]).all():
-#                 return True
-#         return False
-
-# stopping_criteria = StoppingCriteriaList([StopOnTokens()])
-
-
-# text_generation_pipeline = pipeline(
-#     model=model,
-#     tokenizer=tokenizer,
-#     task="text-generation",
-#     temperature=0.01,
-#     repetition_penalty=1.2,
-#     return_full_text=True,
-#     max_new_tokens=750, do_sample=True,
-#     top_k=50, top_p=0.95,
-#     stopping_criteria=stopping_criteria
-# )
-# mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
-# # # # load chroma from disk
-# db3 = Chroma(persist_directory="/chroma/", embedding_function=instructor_embeddings)
-
-
-
-
-# retriever = db3.as_retriever(search_type="similarity_score_threshold",
-#                              search_kwargs={"score_threshold": .5,
-#                                             "k": 20})
-
-# #retriever = db3.as_retriever(search_kwargs={"k":15})
-# # Get pre-written rag prompt
-# def format_docs(docs):
-#     return "\n\n".join(doc.page_content for doc in docs)
-
-
-# template ="""" [INST] Ти асистент для надання відповідей з законодавства України. Використовуй лише вказаний нижче Context максимально точно. Описуй лише події простими словами без формальностей. Пиши чотири речення і будь максимально точним. Якщо контекст пустий - пиши "Я не маю релевантної інформації. Спробуйте ще".
-# Context: {context}
-# ### QUESTION:
-# {question}
-# [/INST]
-# """
-# prompt = PromptTemplate(
-#     input_variables=["context", "question"],
-#     template=template,
-# )
-
-# rag_chain_from_docs = (
-#     RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
-#     | prompt
-#     | mistral_llm
-#     | StrOutputParser()
-# )
-
-# rag_chain_with_source = RunnableParallel(
-#     {"context": retriever, "question": RunnablePassthrough()}
-# ).assign(answer=rag_chain_from_docs)
+
+# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+# from langchain_community.vectorstores import Chroma
+# from langchain_core.output_parsers import StrOutputParser
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    pipeline,
+    StoppingCriteria, StoppingCriteriaList
+)
+
+from langchain.prompts import PromptTemplate
+from langchain_community.llms import HuggingFacePipeline
+from langchain.chains import LLMChain
+from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+
+
+instructor_embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large",
+                                              model_kwargs={"device": "cuda"})
+
+
+model_name='SherlockAssistant/Mistral-7B-Instruct-Ukrainian'
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+#tokenizer.pad_token = tokenizer.unk_token
+#tokenizer.padding_side = "right"
+
+
+# # Activate 4-bit precision base model loading
+# use_4bit = True
+
+# # Compute dtype for 4-bit base models
+# bnb_4bit_compute_dtype = "float16"
+
+# # Quantization type (fp4 or nf4)
+# bnb_4bit_quant_type = "nf4"
+
+# # Activate nested quantization for 4-bit base models (double quantization)
+# use_nested_quant = False
+
+# #################################################################
+# # Set up quantization config
+# #################################################################
+# compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=use_4bit,
+#     bnb_4bit_quant_type=bnb_4bit_quant_type,
+#     bnb_4bit_compute_dtype=compute_dtype,
+#     bnb_4bit_use_double_quant=use_nested_quant,
+# )
+
+# # Check GPU compatibility with bfloat16
+# if compute_dtype == torch.float16 and use_4bit:
+#     major, _ = torch.cuda.get_device_capability()
+#     if major >= 8:
+#         print("=" * 80)
+#         print("Your GPU supports bfloat16: accelerate training with bf16=True")
+#         print("=" * 80)
+
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name)
+stop_list = [" \n\nAnswer:", " \n", " \n\n"]
+stop_token_ids = [tokenizer(x, return_tensors='pt', add_special_tokens=False)['input_ids'] for x in stop_list]
+stop_token_ids = [torch.LongTensor(x).to("cuda") for x in stop_token_ids]
+
+class StopOnTokens(StoppingCriteria):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        for stop_ids in stop_token_ids:
+            if torch.eq(input_ids[0][-len(stop_ids[0])+1:], stop_ids[0][1:]).all():
+                return True
+        return False
+
+stopping_criteria = StoppingCriteriaList([StopOnTokens()])
+
+
+text_generation_pipeline = pipeline(
+    model=model,
+    tokenizer=tokenizer,
+    task="text-generation",
+    temperature=0.01,
+    repetition_penalty=1.2,
+    return_full_text=True,
+    max_new_tokens=750, do_sample=True,
+    top_k=50, top_p=0.95,
+    stopping_criteria=stopping_criteria
+)
+mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
+# # # load chroma from disk
+db3 = Chroma(persist_directory="/chroma/", embedding_function=instructor_embeddings)
+
+
+
+
+retriever = db3.as_retriever(search_type="similarity_score_threshold",
+                             search_kwargs={"score_threshold": .5,
+                                            "k": 20})
+
+#retriever = db3.as_retriever(search_kwargs={"k":15})
+# Get pre-written rag prompt
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+
+
+template ="""" [INST] Ти асистент для надання відповідей з законодавства України. Використовуй лише вказаний нижче Context максимально точно. Описуй лише події простими словами без формальностей. Пиши чотири речення і будь максимально точним. Якщо контекст пустий - пиши "Я не маю релевантної інформації. Спробуйте ще".
+Context: {context}
+### QUESTION:
+{question}
+[/INST]
+"""
+prompt = PromptTemplate(
+    input_variables=["context", "question"],
+    template=template,
+)
+
+rag_chain_from_docs = (
+    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
+    | prompt
+    | mistral_llm
+    | StrOutputParser()
+)
+
+rag_chain_with_source = RunnableParallel(
+    {"context": retriever, "question": RunnablePassthrough()}
+).assign(answer=rag_chain_from_docs)
 
 
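The hunk above turns the previously commented-out RAG pipeline into live module-level code, but the imports for HuggingFaceEmbeddings, Chroma, and StrOutputParser stay commented even though the enabled code calls all three. What follows is a minimal, hypothetical sketch, not part of this commit: the imports the enabled code would need plus a Gradio handler that serves the chain. The import paths, the ask function, and the gr.Interface wiring are assumptions, and the sketch presumes the objects built in the hunk above (retriever, mistral_llm, rag_chain_with_source) are in scope in app.py.

# Hypothetical sketch, not part of this commit.
# Imports the now-enabled code references but leaves commented out;
# exact module paths may differ with the installed LangChain version.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser

import gradio as gr

def ask(question: str) -> str:
    # rag_chain_with_source (built in the hunk above) returns a dict with
    # "context", "question" and "answer"; only the answer is shown to the user.
    result = rag_chain_with_source.invoke(question)
    return result["answer"]

# Assumed wiring: a plain text-in/text-out Gradio interface around the chain.
demo = gr.Interface(fn=ask, inputs="text", outputs="text")
demo.launch()

Invoking rag_chain_with_source rather than mistral_llm directly keeps the retrieved context alongside the answer, which is useful if the app later wants to display or log the source passages.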