import os os.system('pip install --upgrade transformers') import nltk from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTImageProcessor, pipeline import torch from PIL import Image import streamlit as st from nltk.corpus import stopwords from io import BytesIO # os.system('pip install nltk') nltk.download('stopwords') # Load the pre-trained model model = VisionEncoderDecoderModel.from_pretrained( "SumanthKarnati/Image2Ingredients") model.eval() # Define the feature extractor feature_extractor = ViTImageProcessor.from_pretrained( 'nlpconnect/vit-gpt2-image-captioning') # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained( 'nlpconnect/vit-gpt2-image-captioning') # Set up text generation pipeline generator = pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B') # Device configuration device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Transfer the model to GPU if available model = model.to(device) # Set prediction arguments max_length = 16 num_beams = 4 gen_kwargs = {"max_length": max_length, "num_beams": num_beams} # Function to predict ingredients from images def predict_step(image_files, model, feature_extractor, tokenizer, device, gen_kwargs): images = [] for image_file in image_files: if image_file is not None: # Create a BytesIO object from the UploadedFile (image_file) byte_stream = BytesIO(image_file.getvalue()) image = Image.open(byte_stream) if image.mode != "RGB": image = image.convert(mode="RGB") images.append(image) if not images: return None inputs = feature_extractor(images=images, return_tensors="pt") inputs.to(device) output_ids = model.generate(inputs["pixel_values"], **gen_kwargs) preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) preds = [pred.strip() for pred in preds] return preds # Get the list of English stop words stop_words = set(stopwords.words('english')) # Function to remove stop words from a list of words def remove_stop_words(word_list): return [word for word in word_list if word not in stop_words] # Streamlit app code def main(): st.title("Image2Nutrients: Food Ingredient Recognition") st.write("Upload an image of your food to recognize the ingredients!") # File upload uploaded_file = st.file_uploader( "Choose an image", type=["jpg", "jpeg", "png"]) if uploaded_file is not None: # Display the uploaded image image = Image.open(uploaded_file) st.image(image, caption="Uploaded Image", use_column_width=True) # Perform ingredient recognition preds = predict_step([uploaded_file], model, feature_extractor, tokenizer, device, gen_kwargs) preds = preds[0].split('-') # remove numbers preds = [x for x in preds if not any(c.isdigit() for c in x)] # remove empty strings preds = list(filter(None, preds)) # remove duplicates preds = list(dict.fromkeys(preds)) preds = remove_stop_words(preds) # Display the recognized ingredients st.subheader("Recognized Ingredients:") for ingredient in preds: st.write(ingredient) preds_str = ', '.join(preds) # Prepare the prompt prompt = f"You are a knowledgeable assistant that provides nutritional advice based on a list of ingredients. The identified ingredients are: {preds_str}. Note that some ingredients may not make sense, so use the ones that do. Can you provide a nutritional analysis and suggestions for improvement?" # Generate a sequence of text suggestions = generator(prompt, do_sample=True, min_length=200) # Extract the generated text suggestions = suggestions[0]['generated_text'][len(prompt):] st.subheader("Nutritional Analysis and Suggestions:") st.write(suggestions) if __name__ == "__main__": main()