Spaces:

AmithAdiraju1694
/

translatemyimage-beta

Paused

Amith Adiraju committed on Jan 1

Commit

5a29f4a

1 Parent(s): 11b899a

1. Added a custom fine-tuned model to provide item explanations in a specific format.

2. Provided the capability to enter menu items manually rather than uploading an image.
3. Created multiple pages and redirected code accordingly.
4. Added robust regular expressions and other techniques to post-process the outputs shown to the user.

Signed-off-by: Amith Adiraju <[email protected]>

Files changed (6) hide show

app.py +37 -177
inference/config.py +16 -26
inference/preprocess_image.py +57 -4
inference/translate.py +41 -16
pages.py +214 -0
utils.py +15 -0

app.py CHANGED Viewed

@@ -1,204 +1,64 @@
 import streamlit as st
 from streamlit import session_state as sst
-from typing import List, Optional
 import asyncio
-import pandas as pd
-from inference.translate import (
-    extract_filter_img,
-    transcribe_menu_model
-)
-from inference.config import DEBUG_MODE
-from PIL import Image
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import os
-# Setting workers to be 70% of all available virtual cpus in system
-cpu_count = os.cpu_count()
-pool = ThreadPoolExecutor(max_workers=int(cpu_count*0.7) )
 # Initialize session state variable to start with home page
 if "page" not in sst:
     sst["page"] = "Home"
-def navigate_to(page: str) -> None:
-    """
-    Function to set the current page in the state of streamlit. A helper for
-    simulating navigation in streamlit.
-    Parameters:
-        page: str, required.
-    Returns:
-        None
-    """
-    sst["page"] = page
-async def main_page() -> None:
-    """
-    Function that contains content of main page i.e., image uploader and submit button to navigate to next page.
-    Upon submit , control goes to model inference 'page'.
-    Parameters:
-        None
-    Returns:
-        None
-    """
-    # Streamlit app
-    first_title = st.empty()
-    first_title.title("App that explains your menu items ")
-    # Streamlit function to upload an image from any device
-    uploaded_file = st.file_uploader("Choose an image...",
-                                 type=["jpg", "jpeg", "png"])
-    # Remove preivous states' value of input image if it exists
-    sst.pop('input_image', None)
-    # Submit button
-    if uploaded_file is not None:
-        image = Image.open(uploaded_file)
-        # Only show if user wants to see
-        if st.checkbox('Show Uploaded Image'):
-            st.image(image,
-                    caption='Uploaded Image',
-                    use_column_width=True)
-        sst["input_image"] = image
-        # Submit button
-        st.button("Submit",
-                  on_click = navigate_to,
-                  args = ("Inference",))
-        st.info("""This application is for education purposes only. It uses AI, hence it's dietary
-                    recommendations are not to be taken as medical advice, author doesn't bear responsibility
-                    for incorrect dietary recommendations. Please proceed with caution.
-                    """)
-async def dist_llm_inference(inp_texts: List[str]) -> None:
-    """
-    Function that performs concurrent LLM inference using threadpool. It displays
-    results of those threads that are done with execution, as a dynamic row to streamlit table, rather than
-    waiting for all threads to be done.
-    Parameters:
-        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
-    Returns:
-        None
-    """
-    df = pd.DataFrame([('ITEM NAME', 'EXPLANATION')]
-                     )
-    sl_table = st.table(df)
-    tp_futures = { pool.submit(transcribe_menu_model, mi): mi for mi in inp_texts }
-    for tpftr in as_completed(tp_futures):
-        item = tp_futures[tpftr]
-        try:
-            exp = tpftr.result()
-            sl_table.add_rows([(item,exp)] )
-        except Exception as e:
-            print("Could not add a new row dynamically, because of this error:", e)
-    return
-async def model_inference():
     """
-    Function that pre-processes input text from state variables, does concurrent inference
-    and toggles state between pages if needed.
-    Parameters:
-        None
     Returns:
         None
     """
-    second_title = st.empty()
-    second_title.title(" Using ML to explain your menu items ... ")
-    if "input_image" in sst:
-        image = sst["input_image"]
-        msg1 = st.empty()
-        msg1.write("Pre-processing and extracting text out of your image ....")
-        st_filter = time.perf_counter()
-        # Call the extract_filter_img function
-        filtered_text = await extract_filter_img(image)
-        en_filter = time.perf_counter()
-        num_items_detected = len(filtered_text)
-        if num_items_detected == 0:
-            st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image.")
-        elif num_items_detected > 0:
-            st.write(f"Detected {num_items_detected} menu items from your input image ... ")
-            msg2 = st.empty()
-            msg2.write("All pre-processing done, transcribing your menu items now ....")
-            st_trans_llm = time.perf_counter()
-            await dist_llm_inference(filtered_text)
-            msg3 = st.empty()
-            msg3.write("Done transcribing ... ")
-            en_trans_llm = time.perf_counter()
-            msg1.empty(); msg2.empty(); msg3.empty()
-            st.success("Image processed successfully! " )
-            if DEBUG_MODE:
-                filter_time_sec = en_filter - st_filter
-                llm_time_sec = en_trans_llm - st_trans_llm
-                total_time_sec = filter_time_sec + llm_time_sec
-                st.write("Time took to extract and filter text {}".format(filter_time_sec))
-                st.write("Time took to summarize by LLM {}".format(llm_time_sec))
-                st.write('Overall time taken in seconds: {}'.format(total_time_sec))
-            st.button("translate another",
-                      on_click=navigate_to,
-                      args=("Home",))
-    else:
-        st.write("Looks like image upload failed, please try uploading it again ... ")
-async def main():
-    """
-    Function that toggles between pages based on state variables.
-    Parameters:
-        None
-    Returns:
-        None
-    """
-    if sst["page"] == "Home":
-        await main_page()
     elif sst["page"] == "Inference":
-        await model_inference()
 asyncio.run(main())

+from utils import navigate_to
+from pages import manual_input_page, image_input_page, model_inference_page
 import streamlit as st
 from streamlit import session_state as sst
 import asyncio
+# TODO: Fix model inference and post-processing function before moving to production.
 # Initialize session state variable to start with home page
 if "page" not in sst:
     sst["page"] = "Home"
+# Function to remove all session variables from sst, except page.
+def reset_sst():
+    for key in list(sst.keys()):
+        if key != "page":
+            sst.pop(key, None)
+# Landing page function
+async def landing_page():
+    st.title("We will explain your menu like never before!")
+    st.write("\n")
+    st.write("\n")
+    st.write("\n")
+    c1, c2= st.columns(2)
+    with c1:
+        # Navigate to manual input page if user clicks on the button
+        st.button("Enter Items Manually", on_click=navigate_to, args=("ManualInput",))
+    with c2:
+        # Navigate to image input page if user clicks on the button
+        st.button("Upload Items from Image", on_click=navigate_to, args=("ImageInput",))
+# Main function to handle navigation
+async def main():
     """
+    Main function that handles the navigation logic based on the current page.
     Returns:
         None
     """
+    # Navigation logic
+    if sst["page"] == "Home":
+        reset_sst() # reset all session state variables before navigating to the landing page
+        await landing_page()  # Call the landing page function
+    elif sst["page"] == "ManualInput":
+        reset_sst() # reset all session state variables before navigating to the landing page
+        await manual_input_page()  # Call the manual input page function
+    elif sst["page"] == "ImageInput":
+        reset_sst() # reset all session state variables before navigating to the landing page
+        await image_input_page()  # Call the image input page function
     elif sst["page"] == "Inference":
+        await model_inference_page()  # Call the model inference page function
 asyncio.run(main())

inference/config.py CHANGED Viewed

@@ -1,33 +1,23 @@
-INSTRUCTION_PROMPT = """
-The following text contains examples of three items and their corresponding explanations in the required format.\n
-Item -> palak paneer.\n
-Explanation -> Major Ingredients here: paneer ( a.k.a cottage cheese ) , palak ( spinach ).\n
-How it is made: It's a savory item, made like a gravy; usually made by sauteing spices and mixing saute with boiled paneer and palak.\n
-It goes well with: White basmati rice or Indian flat bread.\n
-Allergens: Paneer may cause digestive discomfort and intolerance to some.\n
-Food Category: Vegetarian, Vegans may not like it, as paneer is usually made from cow milk.
-Item -> rumali roti.\n
-Explanation -> Major Ingredients here: roti.\n
-How it is made: A small soft bread, made to size of a napkin ( a.k.a 'rumal' in hindi ); usually made with a combination of whole wheat and all purpose flour.\n
-It goes well with: Most indian gravies such as palak paneer, tomato curry etc.\n
-Allergens: May contain gluten, which is known to cause digestive discomfort and intolerance to some.\n
-Food Category: Vegetarian, Vegan.
-Item -> nizami handi.\n
-Explanation -> Major Ingredients here: Different veggies, makhani sauce (skimmed milk, tomato and cashew paste , indian spices), combination of nuts.\n
-How it is made: Makhani sauce is added to onion-tomato based paste and bought to a boil; a Medley of veggies and gently flavored whole spices are added and boiled for small time.\n
-It goes well with: Different kinds of indian flat breads, white basmati and sonamasoori rice.\n
-Allergens: Presence of nuts, butter cream and makhani sauce are known to cause digestive discomfort and intolerance to some.\n
-Food Category: Usually vegetarian, may include chicken or animal meat sometimes, please check with hotel.
-Based on Item and explanation pairs provided above, provide similar explanation ('Major Ingredients', 'How is it made', 'It goes well with', 'Allergens' and 'Food Category') to the below item.\n
-Item ->
-"""
-DEBUG_MODE = False
-DEVICE = 'cpu'

+import torch
+import re
+model_inf_inp_prompt = "INSTRUCTION: given food item name, explain these things:(major ingredients,making process,portion & spicy/sweet,pairs with,allergens,food type(veg/non-veg/vegan)). ensure to get allergens and food category factually correct.Item Name: {} "
+header_pattern = r'Item Name: (.*?)\. Major Ingredients: (.*?)\. Making Process: (.*?)\. Portion and Spice Level: (.*?)\. Pairs With: (.*?)\. Allergens: (.*?)\. Food Type: (.*?)\.\s*</s>'
+dots_pattern = re.compile(r'\.{3,}')
+DEBUG_MODE = True
+model_name = "AmithAdiraju1694/gpt-neo-125M_menuitemexp"
+def get_device():
+  if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using GPU: {torch.cuda.get_device_name(0)}") #get the name of the GPU being used.
+  else:
+      device = torch.device("cpu")
+      print("Using CPU")
+  return device
+DEVICE = get_device()

inference/preprocess_image.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import numpy as np
-from typing import List, Tuple, Optional, AnyStr
 import nltk
 nltk.download("stopwords")
 nltk.download('punkt')
@@ -53,11 +53,64 @@ def image_to_np_arr(image) -> np.array:
     return np.array(image)
 async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
     output_texts = []
     for _, extr_text, _ in raw_extrc_text:
         # remove all numbers, special characters from a string
         prcsd_txt = preprocess_text(extr_text)
-        if len(prcsd_txt.split(" ") ) >= 2: output_texts.append(prcsd_txt)
-    return output_texts

 import numpy as np
+from typing import List, Tuple, Optional, AnyStr, Dict
 import nltk
 nltk.download("stopwords")
 nltk.download('punkt')
     return np.array(image)
 async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
+    """
+    Function that processes extracted text by removing numbers and special characters,
+    and filters out text with less than 2 words.
+    Parameters:
+        raw_extrc_text: List[Tuple], required -> A list of tuples containing extracted text.
+    Returns:
+        List[AnyStr] -> A list of processed text strings.
+    """
     output_texts = []
     for _, extr_text, _ in raw_extrc_text:
         # remove all numbers, special characters from a string
         prcsd_txt = preprocess_text(extr_text)
+        if len(prcsd_txt.split(" ")) >= 2:
+            output_texts.append(prcsd_txt)
+    return output_texts
+def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern:str) -> List[Dict]:
+    # Define the regular expression pattern to match section names and placeholders
+    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]
+    # Function to clean the strings
+    def clean_string(input_string):
+        parts = input_string.split(',')
+        cleaned_parts = [part.strip() for part in parts if part.strip()]
+        return ', '.join(cleaned_parts)
+    for i in range(len(gen_output)):
+        # Find all matches
+        matches = re.findall(header_pattern, gen_output[i])
+        # Since re.findall returns a list of tuples, we need to extract the first tuple
+        if matches:
+            result = dict(zip(headers,matches[0]))
+            result['Major Ingredients'] = clean_string(result['Major Ingredients'])
+            # if any of dictionary values strings are emtpy, replace it with string "Sorry, can't explain this."
+            for k in result.keys():
+                if len(result[k]) < 3 or any(header in result[k] for header in headers):
+                    result[k] = "Sorry, can't explain this."
+            gen_output[i] = result
+        else:
+            if headers[1] in gen_output[i]:
+                gen_output[i] = {"May contain misleading explanation":
+                                 dots_pattern.sub('' ,
+                                                  gen_output[i].split(headers[1]
+                                                   )[1].strip().replace('</s>', '')
+                                                  )
+                             }
+            else:
+                gen_output[i] = {"Sorry, can't explain this item": "NA"}
+        gen_output[i].pop('Item Name', None)
+    return gen_output

inference/translate.py CHANGED Viewed

@@ -2,29 +2,50 @@ import streamlit as st
 from inference.preprocess_image import (
     image_to_np_arr,
-    process_extracted_text
 )
-from inference.config import INSTRUCTION_PROMPT, DEVICE
 from typing import List, Tuple, Optional, AnyStr, Dict
-from transformers import T5Tokenizer, T5ForConditionalGeneration
 import easyocr
 import time
 use_gpu = True
-if DEVICE == 'cpu': use_gpu = False
 @st.cache_resource
 def load_models(item_summarizer: AnyStr) -> Tuple:
     text_extractor = easyocr.Reader(['en'],
                                     gpu = use_gpu
                                     )
-    tokenizer = T5Tokenizer.from_pretrained(item_summarizer)
-    model = T5ForConditionalGeneration.from_pretrained(item_summarizer)
     return (text_extractor, tokenizer, model)
-text_extractor,item_tokenizer,item_summarizer = load_models(item_summarizer = "google/flan-t5-large")
 # Define your extract_filter_img function
@@ -78,20 +99,24 @@ async def extract_filter_img(image) -> Dict:
 def transcribe_menu_model(menu_text: List[AnyStr]) -> Dict:
-    prompt_item = INSTRUCTION_PROMPT + " " + menu_text + """
-"""
     input_ids = item_tokenizer(prompt_item, return_tensors="pt").input_ids
     outputs = item_summarizer.generate(input_ids,
-                                        max_new_tokens = 512
                                         )
-    return item_tokenizer.decode(
-        outputs[0],
-        skip_special_tokens = True
-        )
 def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
     return extrc_str

 from inference.preprocess_image import (
     image_to_np_arr,
+    process_extracted_text,
+    post_process_gen_outputs
 )
+from inference.config import (
+     model_inf_inp_prompt,
+    header_pattern,
+    dots_pattern,
+    DEVICE,
+    model_name
+                             )
 from typing import List, Tuple, Optional, AnyStr, Dict
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import easyocr
 import time
 use_gpu = True
+if DEVICE.type == 'cpu': use_gpu = False
 @st.cache_resource
 def load_models(item_summarizer: AnyStr) -> Tuple:
+    """
+    Function to load the models required for the inference process. Cached to avoid loading the models, every time the function is called.
+    Parameters:
+        item_summarizer: str, required -> The LLM model name to be used for item summarization.
+    Returns:
+        Tuple -> Tuple containing the required models for the inference process.
+    """
+    # model to extract text from image
     text_extractor = easyocr.Reader(['en'],
                                     gpu = use_gpu
                                     )
+    # tokenizer and model to generate item summary
+    tokenizer = AutoTokenizer.from_pretrained(item_summarizer)
+    model = AutoModelForCausalLM.from_pretrained(item_summarizer)
     return (text_extractor, tokenizer, model)
+text_extractor,item_tokenizer,item_summarizer = load_models(item_summarizer = model_name)
 # Define your extract_filter_img function
 def transcribe_menu_model(menu_text: List[AnyStr]) -> Dict:
+    prompt_item = model_inf_inp_prompt.format(menu_text)
     input_ids = item_tokenizer(prompt_item, return_tensors="pt").input_ids
     outputs = item_summarizer.generate(input_ids,
+                                       max_new_tokens = 512,
+                                       num_beams = 4,
+                                       pad_token_id = item_tokenizer.pad_token_id,
+                                       eos_token_id = item_tokenizer.eos_token_id,
+                                       bos_token_id = item_tokenizer.bos_token_id
+                                       )
+    prediction = item_tokenizer.batch_decode(outputs,
+                                        skip_special_tokens=False
                                         )
+    postpro_output = post_process_gen_outputs( prediction, header_pattern, dots_pattern )[0]
+    return postpro_output
 def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
     return extrc_str

pages.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import streamlit as st
+from streamlit import session_state as sst
+from utils import navigate_to
+from inference.config import DEBUG_MODE
+from inference.translate import extract_filter_img, transcribe_menu_model,classify_menu_text
+from inference.preprocess_image import preprocess_text
+import os
+import time
+import pandas as pd
+from PIL import Image
+from typing import List
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+# Setting workers to be 70% of all available virtual cpus in system
+cpu_count = os.cpu_count()
+pool = ThreadPoolExecutor(max_workers=int(cpu_count*0.7) )
+# Function that handles logic of explaining menu items from manual input
+async def manual_input_page():
+    """
+    Function that takes text input from user in input box of streamlit, user can add multiple text boxes and submit finally.
+    Parameters:
+        None
+    Returns:
+        List[str]: List of strings, containing item names of a menu in english.
+    """
+    st.write("This is the Manual Input Page.")
+    st.write("Once done, click on 'Explain My Menu' button to get explanations for each item ... ")
+    inp_texts = []
+    num_text_boxes = st.number_input("Number of text boxes", min_value=1, step=1)
+    for i in range(num_text_boxes):
+        text_box = st.text_input(f"Food item {i+1}")
+        if text_box:
+            inp_texts.append(text_box)
+    if len(inp_texts) > 0:
+        # Show user submit button only if they have entered some text and set text in session state
+        sst["user_entered_items"] = inp_texts
+        st.button("Explain My Menu",on_click=navigate_to,args=("Inference",))
+    else:
+        st.write("Please enter some items to proceed ...")
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that handles logic of explaining menu items from image uploads
+async def image_input_page():
+    """
+    Function that contains content of main page i.e., image uploader and submit button to navigate to next page.
+    Upon submit , control goes to model inference 'page'.
+    Parameters:
+        None
+    Returns:
+        None
+    """
+    st.write("This is the Image Input Page.")
+    # Streamlit function to upload an image from any device
+    uploaded_file = st.file_uploader("Choose an image...",
+                                 type=["jpg", "jpeg", "png"])
+    # Remove preivous states' value of input image if it exists
+    sst.pop('input_image', None)
+    # Submit button
+    if uploaded_file is not None:
+        image = Image.open(uploaded_file)
+        # Only show if user wants to see
+        if st.checkbox('Show Uploaded Image'):
+            st.image(image,
+                    caption='Uploaded Image',
+                    use_column_width=True)
+        sst["input_image"] = image
+        # Show user submit button only if they have uploaded an image
+        st.button("Translate My Menu",
+                  on_click = navigate_to,
+                  args = ("Inference",))
+        # Warning message to user
+        st.info("""This application is for education purposes only. It uses AI, hence it's dietary
+                    recommendations are not to be taken as medical advice, author doesn't bear responsibility
+                    for incorrect dietary recommendations. Please proceed with caution.
+                    """)
+    # if user wants to go back, make sure to reset the session state
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that handles model inference
+async def model_inference_page():
+    """
+    Function that pre-processes input text from state variables, does concurrent inference
+    and toggles state between pages if needed.
+    Parameters:
+        None
+    Returns:
+        None
+    """
+    second_title = st.empty()
+    second_title.title(" Using ML to explain your menu items ... ")
+    # User can either upload an image or enter text manually, we check for both
+    if "input_image" in sst:
+        image = sst["input_image"]
+        msg1 = st.empty()
+        msg1.write("Pre-processing and extracting text out of your image ....")
+        # Call the extract_filter_img function
+        filtered_text = await extract_filter_img(image)
+        num_items_detected = len(filtered_text)
+    if "user_entered_items" in sst:
+        user_text = sst["user_entered_items"]
+        st.write("Pre-processing and filtering text from user input ....")
+        filtered_text = [preprocess_text(ut) for ut in user_text]
+        num_items_detected = len(filtered_text)
+    # irrespective of source of user entry , we check if we have any items to process
+    if num_items_detected == 0:
+        st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image by going back.")
+    elif num_items_detected > 0:
+        st.write(f"Detected {num_items_detected} menu items from your input image ... ")
+        msg2 = st.empty()
+        msg2.write("All pre-processing done, transcribing your menu items now ....")
+        st_trans_llm = time.perf_counter()
+        await dist_llm_inference(filtered_text)
+        msg3 = st.empty()
+        msg3.write("Done transcribing ... ")
+        en_trans_llm = time.perf_counter()
+        msg2.empty(); msg3.empty()
+        st.success("Image processed successfully! " )
+        # Some basic stats for debug mode
+        if DEBUG_MODE:
+            llm_time_sec = en_trans_llm - st_trans_llm
+            st.write("Time took to summarize by LLM {}".format(llm_time_sec))
+    # If user clicked in "translate_another" button reset all session state variables and go back to home
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that runs LLM inference for every input item concurrently via the thread pool
+async def dist_llm_inference(inp_texts: List[str]) -> None:
+    """
+    Function that performs concurrent LLM inference using threadpool. It displays
+    results of those threads that are done with execution, as a dynamic row to streamlit table, rather than
+    waiting for all threads to be done.
+    Parameters:
+        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
+    Returns:
+        None
+    """
+    df = pd.DataFrame([('ITEM NAME', 'EXPLANATION')]
+                     )
+    sl_table = st.table(df)
+    tp_futures = { pool.submit(transcribe_menu_model, mi): mi for mi in inp_texts }
+    for tpftr in as_completed(tp_futures):
+        item = tp_futures[tpftr]
+        try:
+            exp = tpftr.result()
+            sl_table.add_rows([(item,
+                                str(exp ))
+                                ]
+                                )
+        except Exception as e:
+            print("Could not add a new row dynamically, because of this error:", e)
+    return

utils.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from streamlit import session_state as sst
+def navigate_to(page: str) -> None:
+    """
+    Function to set the current page in the state of streamlit. A helper for
+    simulating navigation in streamlit.
+    Parameters:
+        page: str, required.
+    Returns:
+        None
+    """
+    sst["page"] = page