# Streamlit app "The Guide": estimates a used-car price from a pre-trained
# model and offers a small chat assistant backed by the dataset and OpenAI.

import os
from io import BytesIO

import gdown
import joblib
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import requests
import streamlit as st
from sklearn.preprocessing import LabelEncoder

# Reference year used for every age-derived feature and for the value curve.
# NOTE(review): presumably the year the model was trained for -- confirm
# before replacing it with the wall-clock year.
CURRENT_YEAR = 2024
# Upper bound applied to the engineered miles-per-year feature.
MAX_MILEAGE_PER_YEAR = 200000

# --- Set page configuration ---
st.set_page_config(
    page_title="The Guide",
    page_icon="🚗",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Custom CSS for better styling ---
st.markdown(""" """, unsafe_allow_html=True)


# --- Cache functions ---
def create_brand_categories():
    """Return brand tiers mapping each make to a (low, high) price range in dollars."""
    return {
        'luxury_brands': {
            'rolls-royce': (300000, 600000),
            'bentley': (200000, 500000),
            'lamborghini': (250000, 550000),
            'ferrari': (250000, 600000),
            'mclaren': (200000, 500000),
            'aston-martin': (150000, 400000),
            'maserati': (100000, 300000),
        },
        'premium_brands': {
            'porsche': (60000, 150000),
            'bmw': (40000, 90000),
            'mercedes-benz': (45000, 95000),
            'audi': (35000, 85000),
            'lexus': (40000, 80000),
            'jaguar': (45000, 90000),
            'land-rover': (40000, 90000),
            'volvo': (35000, 75000),
            'infiniti': (35000, 70000),
            'cadillac': (40000, 85000),
            'tesla': (40000, 100000),
        },
        'mid_tier_brands': {
            'acura': (30000, 50000),
            'lincoln': (35000, 65000),
            'buick': (25000, 45000),
            'chrysler': (25000, 45000),
            'alfa-romeo': (35000, 60000),
            'genesis': (35000, 60000),
        },
        'standard_brands': {
            'toyota': (20000, 35000),
            'honda': (20000, 35000),
            'volkswagen': (20000, 35000),
            'mazda': (20000, 32000),
            'subaru': (22000, 35000),
            'hyundai': (18000, 32000),
            'kia': (17000, 30000),
            'ford': (20000, 40000),
            'chevrolet': (20000, 38000),
            'gmc': (25000, 45000),
            'jeep': (25000, 45000),
            'dodge': (22000, 40000),
            'ram': (25000, 45000),
            'nissan': (18000, 32000),
        },
        'economy_brands': {
            'mitsubishi': (15000, 25000),
            'suzuki': (12000, 22000),
            'fiat': (15000, 25000),
            'mini': (20000, 35000),
            'smart': (15000, 25000),
        },
        'discontinued_brands': {
            'pontiac': (5000, 15000),
            'saturn': (4000, 12000),
            'mercury': (4000, 12000),
            'oldsmobile': (3000, 10000),
            'plymouth': (3000, 10000),
            'saab': (5000, 15000),
        },
    }


@st.cache_resource
def download_file_from_google_drive(file_id):
    """Download a Google Drive file with gdown and return its raw bytes.

    The file is written to a temporary path in the working directory, read
    back, and removed again -- also when the download or the read fails.

    Raises:
        Exception: whatever gdown or the file read raised, after the error
            has been shown in the UI.
    """
    url = f"https://drive.google.com/uc?id={file_id}"
    output = f"temp_{file_id}.pkl"
    try:
        with st.spinner('Downloading from Google Drive...'):
            gdown.download(url, output, quiet=False)
            with open(output, 'rb') as f:
                return f.read()
    except Exception as e:
        st.error(f"Error downloading from Google Drive: {str(e)}")
        raise
    finally:
        # Clean up the temporary file even on failure.
        if os.path.exists(output):
            os.remove(output)


@st.cache_data
def load_datasets():
    """Load the dataset CSV from Google Drive.

    Column names are stripped and capitalised ('make' -> 'Make',
    'title_status' -> 'Title_status'); the rest of the app relies on that
    casing.
    """
    dataset_file_id = "1emG-BQ3-x4xsMAGMEznkh1ACdlAj5Dn1"
    try:
        with st.spinner('Loading dataset...'):
            content = download_file_from_google_drive(dataset_file_id)
            # Use BytesIO to read the CSV content
            original_data = pd.read_csv(BytesIO(content), low_memory=False)
            # Ensure column names match the model's expectations
            original_data.columns = original_data.columns.str.strip().str.capitalize()
            return original_data
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        raise


@st.cache_resource
def load_model_and_encodings():
    """Load the model from Google Drive and fit label encoders on the dataset.

    Returns:
        (model, label_encoders) where label_encoders maps the lowercase
        feature name (e.g. 'make') to a LabelEncoder fitted on the stripped
        string values of the matching dataset column.
    """
    model_file_id = "1wKixkdW2pVKEpJW-N1QIyKUr2nYirU7I"
    encoded_features = ['make', 'model', 'condition', 'fuel', 'title_status',
                        'transmission', 'drive', 'size', 'type', 'paint_color']
    try:
        with st.spinner('Loading model...'):
            model_content = download_file_from_google_drive(model_file_id)
            # NOTE(security): joblib.load unpickles the payload; only point
            # this at a file whose origin is trusted.
            model = joblib.load(BytesIO(model_content))

            # Load data for encodings
            original_data = load_datasets()

            # Create fresh encoders from data. Dataset columns are
            # capitalised by load_datasets, so look each feature up under
            # its capitalised name; otherwise only 'Make' would ever match.
            label_encoders = {}
            for feature in encoded_features:
                column = feature.capitalize()
                if column not in original_data.columns:
                    continue
                le = LabelEncoder()
                values = original_data[column].fillna('unknown').astype(str).str.strip()
                le.fit(values.unique())
                label_encoders[feature] = le

            return model, label_encoders
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        raise


# --- Load data and models ---
try:
    original_data = load_datasets()
    model, label_encoders = load_model_and_encodings()
except Exception as e:
    st.error(f"Error loading data or models: {str(e)}")
    st.stop()

# --- Define features ---
numeric_features = ['year', 'odometer', 'age', 'age_squared', 'mileage_per_year']
categorical_features = ['make', 'model', 'condition', 'fuel', 'title_status',
                        'transmission', 'drive', 'size', 'type', 'paint_color']
required_features = numeric_features + categorical_features


# --- Feature engineering functions ---
def create_features(df):
    """Return a copy of df with 'age', 'age_squared' and 'mileage_per_year' added.

    Expects lowercase 'year' and 'odometer' columns.
    """
    df = df.copy()
    df['age'] = CURRENT_YEAR - df['year']
    df['age_squared'] = df['age'] ** 2
    # +1 avoids dividing by zero for current-year cars.
    df['mileage_per_year'] = np.clip(df['odometer'] / (df['age'] + 1), 0, MAX_MILEAGE_PER_YEAR)
    return df


def prepare_input(input_dict, label_encoders, estimator=None):
    """Turn one set of user inputs into the single-row frame the model expects.

    Args:
        input_dict: lowercase feature name -> value; None values are treated
            as 'unknown'.
        label_encoders: lowercase feature name -> fitted LabelEncoder.
        estimator: object exposing ``feature_names_in_``; defaults to the
            module-level ``model``.

    Returns:
        A one-row DataFrame whose columns are exactly
        ``estimator.feature_names_in_``, in that order.
    """
    if estimator is None:
        estimator = model

    # Convert None values to 'unknown' for safe handling
    cleaned = {key: ('unknown' if value is None else value)
               for key, value in input_dict.items()}
    input_df = pd.DataFrame([cleaned])

    # The model's columns use the capitalised spelling of each feature name
    # ('title_status' -> 'Title_status').
    feature_name_mapping = {name: name.capitalize() for name in required_features}
    input_df = input_df.rename(columns=feature_name_mapping)

    # Numeric feature conversions
    input_df["Year"] = pd.to_numeric(input_df.get("Year", 0), errors="coerce")
    input_df["Odometer"] = pd.to_numeric(input_df.get("Odometer", 0), errors="coerce")

    # Feature engineering
    input_df["Age"] = CURRENT_YEAR - input_df["Year"]
    input_df["Age_squared"] = input_df["Age"] ** 2
    input_df["Mileage_per_year"] = (
        input_df["Odometer"] / (input_df["Age"] + 1)
    ).clip(0, MAX_MILEAGE_PER_YEAR)

    # Encode categorical features
    for feature, column in feature_name_mapping.items():
        if feature not in label_encoders:
            continue
        if column not in input_df:
            # The caller did not supply this feature at all.
            input_df[column] = "unknown"
        input_df[column] = input_df[column].fillna("unknown").astype(str).str.strip()
        try:
            input_df[column] = label_encoders[feature].transform(input_df[column])
        except ValueError:
            input_df[column] = 0  # Assign default for unseen values

    # Ensure all required features are present
    expected_columns = list(estimator.feature_names_in_)
    for column in expected_columns:
        if column not in input_df:
            input_df[column] = 0  # Default value for missing features

    # Reorder columns
    return input_df[expected_columns]


# --- Styling functions ---
st.markdown(""" """, unsafe_allow_html=True)


def style_metric_container(label, value):
    """Render a label and its value as a markdown block with HTML enabled."""
    st.markdown(f"""

{label}

{value}

""", unsafe_allow_html=True)


def search_dataset(dataset, make, model=None):
    """Search the dataset for the given make and, optionally, model.

    Matching is case-insensitive. Returns up to five rows with the 'Year',
    'Make', 'Model' and 'Price' columns, or None when nothing matches.
    """
    # Filter by make and model
    query = dataset[dataset['Make'].str.lower() == make.lower()]
    if model:
        query = query[query['Model'].str.lower() == model.lower()]

    if query.empty:
        # No relevant data found in the dataset
        return None
    return query[['Year', 'Make', 'Model', 'Price']].head(5)


# --- Updated GPT Functionality ---
def generate_gpt_response(prompt, dataset):
    """Answer a question from the dataset if possible, otherwise via OpenAI.

    Words of the prompt are matched against the dataset's makes and models.
    When a make is found and the dataset has matching rows, they are shown
    in the UI and a short pointer message is returned. Otherwise the prompt
    is sent to the chat completion API.

    Raises:
        RuntimeError: when the OpenAI fallback is needed but the
            OPENAI_API_KEY environment variable is not set.
    """
    # Build the lookup sets once instead of once per word.
    known_makes = set(dataset['Make'].dropna().astype(str).str.lower())
    known_models = set(dataset['Model'].dropna().astype(str).str.lower())

    make = None
    model = None
    for raw_word in prompt.lower().split():
        # Drop surrounding punctuation so "toyota?" still matches.
        word = raw_word.strip(".,!?;:'\"()")
        if word in known_makes:
            make = word
        elif word in known_models:
            model = word

    # If we find relevant data, use it to respond
    if make:
        dataset_response = search_dataset(dataset, make, model)
        if dataset_response is not None:
            st.write("### Dataset Match Found")
            st.dataframe(dataset_response)  # Show results to the user
            return f"I found some information in our dataset about {make.title()} {model.title() if model else ''}. Please see the details above."

    # If no match is found, fall back to GPT response.
    # The key comes from the environment so it never lives in source control.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
    openai.api_key = api_key

    system_message = {
        "role": "system",
        "content": (
            "You are a helpful car shopping assistant. Provide car recommendations or pricing estimates. "
            "If the dataset lacks information, generate an appropriate response."
        )
    }
    messages = [system_message, {"role": "user", "content": prompt}]

    # NOTE(review): ChatCompletion is the pre-1.0 openai client interface.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=500,
        temperature=0.7,
    )
    return response['choices'][0]['message']['content']


# --- Assistant Section ---
def create_assistant_section(dataset):
    """Render the chat assistant: prompt box, latest answer, history, clear button."""
    st.markdown("""

🤖 Car Shopping Assistant

Ask me anything about cars! For example: 'What's a good car under $30,000 with low mileage?'

""", unsafe_allow_html=True)

    if "assistant_responses" not in st.session_state:
        st.session_state.assistant_responses = []

    prompt = st.text_input("Ask about car recommendations or pricing...",
                           placeholder="Type your question here...")

    if prompt:
        try:
            response = generate_gpt_response(prompt, dataset)
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing.
            response = f"Sorry, I encountered an error: {str(e)}"
        st.session_state.assistant_responses.append(response)

        # Display the latest response
        st.write(response)

        # Optionally display previous responses
        if len(st.session_state.assistant_responses) > 1:
            st.markdown("### Previous Responses")
            for prev_response in st.session_state.assistant_responses[:-1]:
                st.markdown("---")
                st.write(prev_response)

        if st.button("Clear Chat"):
            st.session_state.assistant_responses = []
            # Prefer st.rerun when this Streamlit version has it; otherwise
            # fall back to the older experimental name.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()


# --- Prediction Interface ---
def create_prediction_interface():
    """Render the sidebar inputs.

    Returns:
        (inputs, predict_button): the dict of lowercase feature name ->
        selected value, and whether the predict button was pressed on this
        run.
    """
    with st.sidebar:
        st.markdown("""

Car Details

""", unsafe_allow_html=True)

        # Year slider
        year = st.slider("Year", min_value=1980, max_value=2024, value=2022)

        # Make selection
        make_options = sorted(original_data['Make'].dropna().unique())
        make = st.selectbox("Make", options=make_options)

        # Filter models based on selected make
        filtered_models = sorted(
            original_data[original_data['Make'] == make]['Model'].dropna().unique()
        )
        model_name = st.selectbox(
            "Model",
            options=filtered_models if len(filtered_models) > 0 else ["No models available"],
        )
        if model_name == "No models available":
            st.warning("No models are available for the selected make.")

        # Additional inputs
        condition = st.selectbox(
            "Condition",
            ['new', 'like new', 'excellent', 'good', 'fair', 'salvage', 'parts only'],
        )
        fuel = st.selectbox("Fuel Type", sorted(original_data['Fuel'].fillna('Unknown').unique()))
        odometer = st.number_input("Odometer (miles)", min_value=0, value=20000,
                                   format="%d", step=1000)
        title_status = st.selectbox(
            "Title Status", sorted(original_data['Title_status'].fillna('Unknown').unique())
        )
        transmission = st.selectbox(
            "Transmission", sorted(original_data['Transmission'].fillna('Unknown').unique())
        )
        drive = st.selectbox("Drive Type", sorted(original_data['Drive'].fillna('Unknown').unique()))
        size = st.selectbox("Size", sorted(original_data['Size'].fillna('Unknown').unique()))
        paint_color = st.selectbox(
            "Paint Color", sorted(original_data['Paint_color'].fillna('Unknown').unique())
        )
        car_type = 'sedan'  # Default type

        # Prediction button
        predict_button = st.button("📊 Predict Price", use_container_width=True)

    inputs = {
        'year': year,
        'make': make.strip(),
        'model': model_name if model_name != "No models available" else 'unknown',
        'condition': condition.lower().strip(),
        'fuel': fuel.lower().strip(),
        'odometer': odometer,
        'title_status': title_status.lower().strip(),
        'transmission': transmission.lower().strip(),
        'drive': drive.lower().strip(),
        'size': size.lower().strip(),
        'type': car_type.lower().strip(),
        'paint_color': paint_color.lower().strip(),
    }
    return inputs, predict_button


def create_market_trends_plot_with_model(model, make, base_inputs, label_encoders,
                                         years_range=range(1980, 2025)):
    """Plot a generic depreciation curve over the given model years.

    The curve starts from a fixed $30,000 base price and applies 15%
    depreciation in the first year, 10% per year through year five and 5%
    per year afterwards, with a $2,000 floor. ``model``, ``make``,
    ``base_inputs`` and ``label_encoders`` are accepted for interface
    compatibility but do not influence the curve.

    Returns:
        The matplotlib Figure, or None when ``years_range`` is empty.
    """
    base_price = 30000  # Average new car price
    predictions = []
    for year in years_range:
        age = CURRENT_YEAR - year
        # Depreciation curve
        if age <= 1:
            value_factor = 0.85  # 15% first year depreciation
        elif age <= 5:
            value_factor = 0.85 * (0.90 ** (age - 1))  # 10% years 2-5
        else:
            value_factor = 0.85 * (0.90 ** 4) * (0.95 ** (age - 5))  # 5% thereafter
        price = base_price * value_factor
        predictions.append({"year": year, "predicted_price": max(price, 2000)})  # Floor of $2000

    if not predictions:
        return None

    predictions_df = pd.DataFrame(predictions)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(predictions_df["year"], predictions_df["predicted_price"],
            color="#FF4B4B", linewidth=2)
    ax.set_title("Average Car Value by Age")
    ax.set_xlabel("Year")
    ax.set_ylabel("Value ($)")
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'${x:,.0f}'))
    ax.grid(True, alpha=0.3)
    return fig


def inspect_model_features(model):
    """Print the feature names the model expects, if it exposes them."""
    try:
        if hasattr(model, "feature_names_in_"):
            print("Model feature names:", model.feature_names_in_)
        else:
            print("Model does not have 'feature_names_in_' attribute.")
    except Exception as e:
        print(f"Error inspecting model features: {e}")


def predict_with_ranges(inputs, model, label_encoders):
    """Predict a price and a plausible range for the given car.

    The range is the brand's price band scaled by mileage, age and condition
    factors. The prediction is the model output (inverse of log1p) scaled by
    the same factors and clamped into that range, so repeated calls with the
    same inputs give the same result.

    Returns:
        dict with 'predicted_price', 'min_price' and 'max_price'.
    """
    input_df = prepare_input(inputs, label_encoders, estimator=model)
    base_prediction = float(np.expm1(model.predict(input_df)[0]))

    # Brand keys are lowercase and hyphenated ('land-rover').
    make = inputs['make'].lower().strip().replace(' ', '-')
    condition = inputs['condition']
    odometer = inputs['odometer']
    age = max(CURRENT_YEAR - inputs['year'], 0)

    # Find brand category and price range
    price_range = None
    for brands in create_brand_categories().values():
        if make in brands:
            price_range = brands[make]
            break
    if not price_range:
        price_range = (15000, 35000)  # Default range

    # Calculate adjustment factors
    mileage_factor = max(1 - (odometer / 200000) * 0.3, 0.7)
    age_factor = 0.85 ** min(age, 15)
    # NOTE(review): 'parts only' is selectable in the UI but has no entry
    # here, so it falls back to 0.7 (same as 'good') -- confirm intent.
    condition_factor = {
        'new': 1.0,
        'like new': 0.9,
        'excellent': 0.8,
        'good': 0.7,
        'fair': 0.5,
        'salvage': 0.3,
    }.get(condition, 0.7)

    # Apply all factors
    adjustment = mileage_factor * age_factor * condition_factor
    min_price = price_range[0] * adjustment
    max_price = price_range[1] * adjustment
    predicted_price = base_prediction * adjustment

    # Keep the model-based estimate, but never report a value outside the
    # range displayed next to it.
    final_prediction = float(np.clip(predicted_price, min_price, max_price))

    return {
        'predicted_price': final_prediction,
        'min_price': min_price,
        'max_price': max_price,
    }


# --- Main Application ---
def main(model, label_encoders, dataset):
    """Lay out the page: prediction results on the left, assistant on the right."""
    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("""

The Guide 🚗

A cutting-edge data science project leveraging machine learning to detect which car would be best for you.

""", unsafe_allow_html=True)

        inputs, predict_button = create_prediction_interface()

        if predict_button:
            st.write(f"Analyzing {inputs['year']} {inputs['make'].title()} {inputs['model'].title()}...")
            prediction_results = predict_with_ranges(inputs, model, label_encoders)

            # Dollar signs are escaped so Streamlit does not read the text
            # between two of them as a LaTeX expression.
            st.markdown(f"""
### Price Analysis
- **Estimated Range**: \\${prediction_results['min_price']:,.2f} - \\${prediction_results['max_price']:,.2f}
- **Model Prediction**: \\${prediction_results['predicted_price']:,.2f}
""")

            # Generate and display the graph
            fig = create_market_trends_plot_with_model(model, inputs["make"], inputs, label_encoders)
            if fig:
                st.pyplot(fig)
                # Release the figure; pyplot keeps it registered otherwise
                # and every rerun would add another one.
                plt.close(fig)
            else:
                st.warning("No graph generated. Please check your data or selection.")

    with col2:
        create_assistant_section(dataset)


if __name__ == "__main__":
    try:
        # Data and model were already loaded (and cached) at module level.
        main(model, label_encoders, original_data)
    except Exception as e:
        st.error(f"Error running the application: {str(e)}")
        st.stop()