Abbeite committed on
Commit 8be321d · verified · 1 Parent(s): 1dc34bb

Update app.py

Files changed (1)
  1. app.py +113 -61
app.py CHANGED
@@ -1,66 +1,118 @@
  import streamlit as st
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import fitz  # PyMuPDF
  import torch

- # Function to load the PDF document
- @st.cache(allow_output_mutation=True)
- def load_pdf_document(file_path):
-     text = ""
-     with fitz.open(file_path) as doc:
-         for page in doc:
-             text += page.get_text()
-     return text
-
- # Function to load the model and tokenizer
- @st.cache(allow_output_mutation=True)
- def load_model_and_tokenizer(model_name):
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModelForCausalLM.from_pretrained(model_name)
      return tokenizer, model

- # Function to generate an answer from the model
- def generate_answer(context, query, tokenizer, model):
-     # Aim to use a chunk of the context that keeps us within the max model input size
-     # This is a simplified approach: in practice, you'd want to find a more intelligent way to select the relevant part of the context
-     max_context_length = tokenizer.model_max_length - len(tokenizer.encode(query, add_special_tokens=True)) - 50  # Adjust buffer as needed
-
-     if len(tokenizer.encode(context, add_special_tokens=False)) > max_context_length:
-         # If context is too long, truncate it from the beginning (simple approach)
-         start_index = len(tokenizer.encode(context, add_special_tokens=False)) - max_context_length
-         truncated_context = tokenizer.decode(tokenizer.encode(context, add_special_tokens=False)[start_index:])
-     else:
-         truncated_context = context
-
-     encoded_input = tokenizer.encode_plus(query, truncated_context, add_special_tokens=True, return_tensors="pt", truncation=True)
-     input_ids = encoded_input["input_ids"]
-     attention_mask = encoded_input["attention_mask"]
-
-     # Use max_new_tokens to control the length of the generated content
-     output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=150, num_return_sequences=1, temperature=0.7, top_p=0.9)
-     answer = tokenizer.decode(output[0], skip_special_tokens=True)
-     return answer
-
-
- # Streamlit UI
- st.title("Question Answering with LLaMA 2")
- document_path = "jeff_wo.pdf"
- document_text = load_pdf_document(document_path)
-
- # Optional: Display the document text or a portion of it
- st.text_area("Document Text (preview)", value=document_text[:1000], height=250, help="Preview of the document text.")
-
- # Load model and tokenizer
- model_name = "NousResearch/Llama-2-7b-chat-hf"
- tokenizer, model = load_model_and_tokenizer(model_name)
-
- # User input for the query
- query = st.text_input("Enter your question:", "")
-
- if st.button("Generate Answer"):
-     if query:
-         with st.spinner("Generating answer..."):
-             answer = generate_answer(document_text, query, tokenizer, model)
-             st.write(answer)
-     else:
-         st.error("Please enter a question to get an answer.")
 
  import streamlit as st
+
+ # Import transformer classes for generation
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
+ # Import torch for datatype attributes
  import torch
+ # Import the prompt wrapper...but for llama index
+ from llama_index.prompts.prompts import SimpleInputPrompt
+ # Import the llama index HF Wrapper
+ from llama_index.llms import HuggingFaceLLM
+ # Bring in embeddings wrapper
+ from llama_index.embeddings import LangchainEmbedding
+ # Bring in HF embeddings - need these to represent document chunks
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ # Bring in stuff to change service context
+ from llama_index import set_global_service_context
+ from llama_index import ServiceContext
+ # Import deps to load documents
+ from llama_index import VectorStoreIndex, download_loader
+ from pathlib import Path
+
+ # Define variable to hold llama2 weights naming
+ name = "meta-llama/Llama-2-70b-chat-hf"
+ # Set auth token variable from hugging face
+ auth_token = "YOUR HUGGING FACE AUTH TOKEN HERE"
+
+
+ @st.cache_resource
+ def get_tokenizer_model():
+     # Create tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(name, cache_dir='./model/', use_auth_token=auth_token)
+
+     # Create model
+     model = AutoModelForCausalLM.from_pretrained(name, cache_dir='./model/',
+                                                  use_auth_token=auth_token, torch_dtype=torch.float16,
+                                                  rope_scaling={"type": "dynamic", "factor": 2}, load_in_8bit=True)

      return tokenizer, model
+ tokenizer, model = get_tokenizer_model()
+
+ # Create a system prompt
+ system_prompt = """<s>[INST] <<SYS>>
+ You are a helpful, respectful and honest assistant. Always answer as
+ helpfully as possible, while being safe. Your answers should not include
+ any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
+ Please ensure that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain
+ why instead of answering something not correct. If you don't know the answer
+ to a question, please don't share false information.
+
+ Your goal is to provide answers relating to the workout science and information in the document.<</SYS>>
+ """
+
+
+ # Throw together the query wrapper
+ query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")
+
+
+ llm = HuggingFaceLLM(context_window=1024,
+                      max_new_tokens=128,
+                      system_prompt=system_prompt,
+                      query_wrapper_prompt=query_wrapper_prompt,
+                      model=model,
+                      tokenizer=tokenizer)
+
+ # Create and dl embeddings instance
+ embeddings = LangchainEmbedding(
+     HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ )
+
+
+ # Create new service context instance
+ service_context = ServiceContext.from_defaults(
+     chunk_size=1024,
+     llm=llm,
+     embed_model=embeddings
+ )
+ # And set the service context
+ set_global_service_context(service_context)
+
+ # Download PDF Loader
+ PyMuPDFReader = download_loader("PyMuPDFReader")
+ # Create PDF Loader
+ loader = PyMuPDFReader()
+ # Load documents
+ documents = loader.load(file_path=Path('jeff_wo.pdf'), metadata=True)
+
+ # Create an index - we'll be able to query this in a sec
+ index = VectorStoreIndex.from_documents(documents)
+ # Setup index query engine using LLM
+ query_engine = index.as_query_engine()
+
+
+ # Create centered main title
+ st.title('🦙 Llama Banker')
+ # Create a text input box for the user
+ prompt = st.text_input('Input your prompt here')
+
+ # If the user hits enter
+ if prompt:
+     response = query_engine.query(prompt)
+     # ...and write it out to the screen
+     st.write(response)

+     # Display raw response object
+     with st.expander('Response Object'):
+         st.write(response)
+     # Display source text
+     with st.expander('Source Text'):
+         st.write(response.get_formatted_sources())
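Note: the committed file still hardcodes a placeholder token (auth_token = "YOUR HUGGING FACE AUTH TOKEN HERE"), which is needed to download the gated Llama 2 weights. A minimal sketch of one way to supply it without committing a secret, assuming an environment variable named HF_AUTH_TOKEN (the variable name is not part of this commit):

import os

# Hypothetical alternative: read the gated-model token from the environment
# instead of hardcoding it in app.py.
auth_token = os.environ.get("HF_AUTH_TOKEN")
if auth_token is None:
    raise RuntimeError(
        "Set HF_AUTH_TOKEN before launching, e.g. HF_AUTH_TOKEN=hf_xxx streamlit run app.py"
    )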