# -----------------------------------------------------------------------------
# Environment setup notes, kept for reference (commented out).
# -----------------------------------------------------------------------------
# import torch; torch.version.cuda
# # from huggingface_hub import login, HfFolder
# import subprocess
#
# # import getpass
# #
# # def run_sudo_command(cmd):
# #     try:
# #         password = getpass.getpass(prompt="Enter your sudo password: ")  # Securely get the password
# #         result = subprocess.run(["sudo", "-S"] + cmd, input=password.encode(),
# #                                 capture_output=True, text=True, check=True)
# #         print(result.stdout)
# #     except subprocess.CalledProcessError as e:
# #         print(f"Error executing command: {e.stderr}")
# #
# # # Run the ldconfig command
# # run_sudo_command(["ldconfig", "/usr/lib64-nvidia"])
#
# def run_command(cmd, shell=False):
#     """Runs a shell command and prints the output."""
#     try:
#         result = subprocess.run(cmd, shell=shell, capture_output=True, text=True, check=True)
#         print(result.stdout)
#     except subprocess.CalledProcessError as e:
#         print(f"Error executing command: {e.stderr}")
#
# subprocess.run(["pip", "install", "--upgrade", "pip"], check=True)
# # subprocess.run(["pip", "install", "--upgrade", "torch"], check=True)
# # subprocess.run(["pip", "install", "--upgrade", "transformers"], check=True)
#
# # Pip install command as a list
# pip_command = [
#     "pip", "install", "--upgrade", "--force-reinstall", "--no-cache-dir",
#     "torch==2.1.1", "triton",
#     "--index-url", "https://download.pytorch.org/whl/cu121",
# ]
# run_command(pip_command)
# run_command(["pip", "install", "--no-deps", "trl", "peft", "accelerate", "bitsandbytes"])
# # subprocess.run(["pip", "install", "--upgrade", "peft"], check=True)
# subprocess.run(["pip", "install", "xformers"], check=True)
# # subprocess.run(["pip", "install", "--upgrade", "accelerate"], check=True)
# subprocess.run(["pip", "install", "unsloth[cu121-ampere-torch211] @ git+https://github.com/unslothai/unsloth.git"], check=True)
#
# # Alternative: conda-based setup, kept for reference.
# import subprocess
#
# # 1. Create the conda environment
# run_command(["conda", "create", "-y", "--name", "unsloth_env", "python=3.10"])
#
# # 2. Activate the environment (Note: requires shell=True)
# run_command("conda activate unsloth_env", shell=True)
#
# # 3. Install PyTorch and related packages with conda (pick 12.1 or 11.8 for pytorch-cuda)
# run_command("conda install pytorch-cuda=<12.1/11.8> pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers", shell=True)
#
# # 4. Install unsloth from the GitHub repository with pip
# run_command("pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"", shell=True)
#
# # 5. Install additional pip packages without dependencies
# run_command("pip install --no-deps trl peft accelerate bitsandbytes", shell=True)
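# --- Optional sanity check (added sketch) ------------------------------------
# A minimal, illustrative check before the runtime installs below. It mirrors
# the commented-out `torch.version.cuda` probe at the top of this file and is
# wrapped in try/except so a missing torch never blocks startup. This is an
# assumption about useful diagnostics, not part of the original script.
try:
    import torch as _torch_probe
    print("torch", _torch_probe.__version__,
          "| CUDA build:", _torch_probe.version.cuda,
          "| CUDA available:", _torch_probe.cuda.is_available())
except Exception as probe_error:
    print(f"torch not importable yet: {probe_error}")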
# -----------------------------------------------------------------------------
# Runtime dependency installs (executed on startup).
# -----------------------------------------------------------------------------
import subprocess


def run_command(cmd):
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e.stderr}")


# Pip install xformers
run_command([
    "pip", "install", "-U", "xformers<0.0.26",
    "--index-url", "https://download.pytorch.org/whl/cu121",
])

# Pip install unsloth from GitHub
run_command([
    "pip", "install",
    "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git",
])

import os

HF_TOKEN = os.environ["HF_TOKEN"]

import re
import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

# -----------------------------------------------------------------------------
# Earlier model-loading attempts, kept for reference (commented out).
# -----------------------------------------------------------------------------
# from peft import PeftModel, PeftConfig
#
# tokenizer = AutoTokenizer.from_pretrained("FlawedLLM/Bhashini_00")
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16)
# config = AutoConfig.from_pretrained("FlawedLLM/Bhashini_00")
# model = AutoModelForCausalLM.from_pretrained("FlawedLLM/Bhashini_00",
#                                              device_map="auto",
#                                              quantization_config=quantization_config,
#                                              torch_dtype=torch.float16,
#                                              low_cpu_mem_usage=True,
#                                              use_safetensors=True,
#                                              )
#
# # Assuming you have your HF repository in this format: "your_username/your_model_name"
# model_id = "FlawedLLM/BhashiniLLM"
# # Load the base model (the one you fine-tuned with LoRA)
# base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')  # Load in 8-bit for efficiency
# for param in base_model.parameters():
#     param.data = param.data.to(torch.float16)  # or torch.float32
# # Load the LoRA adapter weights
# model = PeftModel.from_pretrained(base_model, model_id)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
#
# model = AutoModel.from_pretrained("FlawedLLM/Bhashini", load_in_4bit=True, device_map='auto')
#
# # I highly do NOT suggest - use Unsloth if possible
# from peft import AutoPeftModelForCausalLM
# from transformers import AutoTokenizer
# model = AutoPeftModelForCausalLM.from_pretrained(
#     "FlawedLLM/Bhashini",  # YOUR MODEL YOU USED FOR TRAINING
#     load_in_4bit=True,
# )
# tokenizer = AutoTokenizer.from_pretrained("FlawedLLM/Bhashini")
#
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
# tokenizer = AutoTokenizer.from_pretrained("FlawedLLM/Bhashini_9")
# config = AutoConfig.from_pretrained("FlawedLLM/Bhashini_9")  # Load configuration
# # quantization_config = BitsAndBytesConfig(
# #     load_in_4bit=True,
# #     bnb_4bit_use_double_quant=True,
# #     bnb_4bit_quant_type="nf4",
# #     bnb_4bit_compute_dtype=torch.float16
# # )
# # torch_dtype = torch.float16
# model = AutoModelForCausalLM.from_pretrained("FlawedLLM/Bhashini_9", config=config,
#                                              ignore_mismatched_sizes=True).to('cuda')
#
# # Load model directly
# tokenizer = AutoTokenizer.from_pretrained("FlawedLLM/Bhashini89", trust_remote_code=True)
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16)
# model = AutoModelForCausalLM.from_pretrained("FlawedLLM/Bhashini89",
#                                              device_map="auto",
#                                              quantization_config=quantization_config,
#                                              torch_dtype=torch.float16,
#                                              low_cpu_mem_usage=True,
#                                              use_safetensors=True,
#                                              trust_remote_code=True)

from unsloth import FastLanguageModel
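# --- Hugging Face authentication (added sketch) -------------------------------
# HF_TOKEN is read from the environment above but never used in the original
# script. If "FlawedLLM/Bhashini_gemma_lora_clean_final" were a private repo,
# one would typically log in before downloading it; this is an assumption and
# mirrors the commented-out `from huggingface_hub import login` at the top of
# the file, not something the original code does.
from huggingface_hub import login

try:
    login(token=HF_TOKEN)
except Exception as login_error:
    print(f"Hugging Face login skipped/failed: {login_error}")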
# Model configuration. NOTE: the original script used max_seq_length, dtype and
# load_in_4bit without defining them; the values below follow the usual Unsloth
# notebook defaults so the call actually runs.
max_seq_length = 2048
dtype = None          # None lets Unsloth auto-detect (float16/bfloat16)
load_in_4bit = True   # 4-bit quantization to reduce GPU memory use

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="FlawedLLM/Bhashini_gemma_lora_clean_final",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!


@spaces.GPU(duration=300)
def chunk_it(input_command, item_list):
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    if item_list is not None:
        item_list = f'''The ItemName should be chosen from the given list : {item_list} , except when adding item. If ItemName does not find anything SIMILAR in the list, then the ItemName should be "Null" '''

    inputs = tokenizer(
        [
            alpaca_prompt.format(
                f'''
You will receive text input that you need to analyze to perform the following tasks:

transaction: Record the details of an item transaction.
last n days transactions: Retrieve transaction records for a specified time period.
view risk inventory: View inventory items based on a risk category.
view inventory: View inventory details.
new items: Add new items to the inventory.
old items: View old items in inventory.
report generation: Generate various inventory reports.

Required Parameters:
Each task requires specific parameters to execute correctly:

transaction:
    ItemName (string)
    ItemQt (quantity - integer)
    Type (string: "sale" or "purchase" or "return")
    ShelfNo (string or integer)
    ReorderPoint (integer)
last n days transactions:
    ItemName (string)
    Duration (integer: number of days)
view risk inventory:
    RiskType (string: "overstock", "understock", or Null for all risk types)
view inventory:
    ItemName (string)
    ShelfNo (string or integer)
new items:
    ItemName (string)
    SellingPrice (number)
    CostPrice (number)
old items:
    ShelfNo (string or integer)
report generation:
    ItemName (string)
    Duration (integer: number of days)
    ReportType (string: "profit", "revenue", "inventory", or Null for all reports)

{item_list}

ALWAYS provide output in a JSON format.''',  # instruction
                input_command,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=216, use_cache=True)
    reply = tokenizer.batch_decode(outputs)

    # Regular expression pattern to match content between "### Response:" and "<|end_of_text|>"
    pattern = r"### Response:\n(.*?)<\|end_of_text\|>"
    # Search for the pattern in the text
    match = re.search(pattern, reply[0], re.DOTALL)  # re.DOTALL allows '.' to match newlines
    if match is None:
        # Fall back to the raw decoded text if the markers are not found
        return reply[0]
    return match.group(1).strip()  # Extract and remove extra whitespace


# iface = gr.Interface(fn=chunk_it,
#                      inputs="text",
#                      outputs="text",
#                      title="Formatter_Pro",
#                      )
iface = gr.Interface(
    fn=chunk_it,
    inputs=[
        gr.Textbox(label="Input Command", lines=3),
        gr.Textbox(label="Item List", lines=5),
    ],
    outputs="text",
    title="Formatter Pro",
)
iface.launch(inline=False)