TheBobBob committed · Commit 2a6e32c (verified) · 1 Parent(s): 5192c1b

Update app.py

Files changed (1): app.py (+8 −12)
app.py CHANGED
@@ -152,6 +152,8 @@ def create_vector_db(final_items):
     device = "cpu"
 
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    tokenizer.pad_token = tokenizer.eos_token
+
     model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
 
     for item in final_items:
@@ -165,7 +167,7 @@ def create_vector_db(final_items):
         Here is the antimony segment to summarize: {item}
         """
 
-        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
+        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=False).to(device)
 
         response = model.generate(
             input_ids=inputs["input_ids"],
@@ -196,18 +198,15 @@ def generate_response(db, query_text, previous_context):
     import torch
     from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    # Define model and tokenizer paths
     model_path = "nvidia/Mistral-NeMo-Minitron-8B-Base"
     tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-    # Set device and dtype
+    tokenizer.pad_token = tokenizer.eos_token
+
     device = 'cuda'
     dtype = torch.bfloat16
 
-    # Load the model with appropriate dtype and device mapping
     model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
 
-    # Define your prompt template
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
 
@@ -223,17 +222,14 @@ def generate_response(db, query_text, previous_context):
     {query_text}
     """
 
-    # Tokenize the input with padding and return the attention mask
-    inputs = tokenizer(prompt_template, return_tensors='pt', padding=True, truncation=True).to(model.device)
+    inputs = tokenizer(prompt_template, return_tensors='pt', padding=True, truncation=False).to(model.device)
 
-    # Generate the model's output with attention mask
     outputs = model.generate(
         input_ids=inputs['input_ids'],
-        attention_mask=inputs['attention_mask'], # Add attention mask to the model
-        max_length=1024 # Define a more reasonable max_length
+        attention_mask=inputs['attention_mask'],
+        max_length=1024
    )
 
-    # Decode and print the output
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     print(response)
 
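
For reference, below is a minimal, self-contained sketch of the padding pattern the updated code relies on: causal-LM tokenizers often ship without a pad token, so one is assigned (here the EOS token) before tokenizing with padding=True, and the resulting attention mask is passed to generate. The checkpoint name and prompt are illustrative placeholders, not values taken from app.py.

    # Sketch only: "gpt2" is a stand-in checkpoint, not the model used in app.py.
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    checkpoint = "gpt2"
    device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Causal-LM tokenizers frequently have no pad token; reuse EOS so padding=True works.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

    prompt = "Summarize this antimony segment: S1 -> S2; k1*S1"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    # Pass the attention mask explicitly so padded positions are ignored during generation.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))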