import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()

# Authenticate with Hugging Face
def authenticate_huggingface():
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # Load token from environment variable
    if token:
        login(token)  # Log in using the Hugging Face token
    else:
        st.error("Hugging Face token not found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")

# Load the Llama 2 model from Hugging Face (cached so it loads only once per session)
@st.cache_resource
def load_llama_model():
    authenticate_huggingface()  # Ensure authentication is done before loading
    model_name = "meta-llama/Llama-2-7b-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=True)
    return tokenizer, model

# Function to query the Llama 2 model
def query_llama_model(penal_code, tokenizer, model):
    prompt = f"What is California Penal Code {penal_code}?"

    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate output from the model (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated text
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return description

# Function to process the CSV and fill in missing descriptions
def update_csv_with_descriptions(csv_file, tokenizer, model):
    # Rewind the uploaded file: it was already read once in main(),
    # so the buffer must be reset before reading it again
    csv_file.seek(0)
    df = pd.read_csv(csv_file)

    # Dictionary to store penal codes and their descriptions
    penal_code_dict = {}

    # Iterate through each row in the CSV
    for index, row in df.iterrows():
        penal_code = row['Offense Number']

        # Query only when the description is missing or blank
        # (empty CSV cells are read as NaN, which a plain truthiness check would miss)
        if pd.isna(row['Description']) or not str(row['Description']).strip():
            st.write(f"Querying description for {penal_code}...")
            description = query_llama_model(penal_code, tokenizer, model)

            # Update the dataframe with the description
            df.at[index, 'Description'] = description

            # Add to dictionary
            penal_code_dict[penal_code] = description

    # Save the updated CSV file
    updated_file_path = 'updated_' + csv_file.name
    df.to_csv(updated_file_path, index=False)

    return penal_code_dict, updated_file_path

# Streamlit UI
def main():
    st.title("Penal Code Description Extractor with Llama 2")

    # Load the Llama 2 model and tokenizer
    tokenizer, model = load_llama_model()

    # Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file with Penal Codes", type=["csv"])

    if uploaded_file is not None:
        # Display the uploaded file
        st.write("Uploaded CSV File:")
        df = pd.read_csv(uploaded_file)
        st.dataframe(df)

        # Process the file and update descriptions
        if st.button("Get Penal Code Descriptions"):
            penal_code_dict, updated_file_path = update_csv_with_descriptions(uploaded_file, tokenizer, model)

            # Show dictionary output
            st.write("Penal Code Descriptions:")
            st.json(penal_code_dict)

            # Provide a download link for the updated CSV
            with open(updated_file_path, 'rb') as f:
                st.download_button(
                    label="Download Updated CSV",
                    data=f,
                    file_name=updated_file_path,
                    mime='text/csv'
                )

if __name__ == "__main__":
    main()
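
# Usage (a minimal sketch; the filename app.py and the .env location are
# assumptions, not part of the original script):
#
#   pip install streamlit pandas torch transformers huggingface_hub python-dotenv
#   streamlit run app.py
#
# A .env file in the working directory should define the token, e.g.:
#
#   HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxxxxxx
#
# Note: meta-llama/Llama-2-7b-hf is a gated repository, so the account behind
# the token must have been granted access to Llama 2 on the Hugging Face Hub;
# otherwise from_pretrained() will fail with an authorization error.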