File size: 3,991 Bytes
9a0f501
 
2a12b77
9a0f501
 
 
 
 
 
 
 
 
 
11b899a
 
 
 
 
 
 
 
 
 
 
 
9a0f501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11b899a
 
 
 
 
 
 
 
 
 
 
9a0f501
 
11b899a
2a12b77
 
 
 
 
 
 
 
 
 
9a0f501
 
 
 
2a12b77
 
9a0f501
2a12b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

import numpy as np
from typing import List, Tuple, Optional, AnyStr, Dict
import nltk
# NOTE(review): these downloads run as a module-import side effect and hit the
# network on first run (cached afterwards); consider moving them to an
# explicit setup step so importing this module stays cheap and offline-safe.
nltk.download("stopwords")
nltk.download('punkt')

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re


def preprocess_text(sentence: str) -> str:
    """
    Pre-process raw text by removing special characters, hyperlinks,
    numbers, and English stop words.

    Steps, in order:
      1. lowercase and drop the literal '{html}' marker
      2. strip HTML tags (``<...>``)
      3. remove ``http...`` URL fragments
      4. remove digit runs
      5. tokenize on word characters; keep tokens longer than 2 chars
         that are not English stop words

    Parameters:
        sentence: str, required -> raw string which may contain stop words,
            special characters, URLs, numbers, etc.

    Returns:
        str -> cleaned, single-space-joined tokens.
    """
    lowered = sentence.lower().replace('{html}', "")

    # Strip HTML tags, then URLs, then digits.
    no_html = re.sub(r'<.*?>', '', lowered)
    no_urls = re.sub(r'http\S+', '', no_html)
    no_nums = re.sub(r'[0-9]+', '', no_urls)

    tokens = RegexpTokenizer(r'\w+').tokenize(no_nums)

    # Build the stop-word set ONCE: the original called
    # stopwords.words('english') inside the comprehension, re-reading the
    # corpus list for every token (O(tokens * stopwords)).
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]

    return " ".join(filtered_words)

def image_to_np_arr(image) -> np.ndarray:
    """
    Convert an image-like object into a numpy array.

    Parameters:
        image: any object accepted by ``np.array`` (e.g. a PIL image or a
            nested sequence). No dtype conversion is performed — the dtype
            is whatever ``np.array`` infers from the input.
            (The previous docstring documented unrelated parameters and a
            float conversion the code never did.)

    Returns:
        np.ndarray
    """
    return np.array(image)

async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """
    Clean extracted-text tuples and keep only multi-word results.

    Parameters:
        raw_extrc_text: List[Tuple], required -> 3-tuples where the middle
            element is the extracted text (presumably bbox/text/confidence —
            only the middle element is used here).

    Returns:
        List[AnyStr] -> cleaned strings containing at least two words.
    """
    # Clean every extracted string, then keep those with >= 2 words.
    cleaned = [preprocess_text(text) for _, text, _ in raw_extrc_text]
    return [txt for txt in cleaned if len(txt.split(" ")) >= 2]

def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern: re.Pattern) -> List[Dict]:
    """
    Convert raw generated strings into structured dicts, in place.

    Parameters:
        gen_output: List[str], required -> raw model outputs. Each entry is
            replaced by a dict (the list is mutated AND returned).
        header_pattern: str, required -> regex producing one capture group
            per header below, in order, for a well-formed output.
        dots_pattern: re.Pattern, required -> compiled regex used to scrub
            filler dot-runs from fallback explanations. NOTE: previously
            annotated ``str``, but ``.sub()`` is called on it, so a compiled
            pattern is required.

    Returns:
        List[Dict] -> the same list; each element is a dict keyed by the
        section headers, with 'Item Name' removed.
    """
    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]

    def clean_string(input_string):
        # Normalize a comma-separated list: trim each part, drop empties.
        parts = input_string.split(',')
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ', '.join(cleaned_parts)

    for i, raw_text in enumerate(gen_output):
        matches = re.findall(header_pattern, raw_text)

        if matches:
            # re.findall returns a list of group-tuples; zip the first
            # match's groups against the headers.
            result = dict(zip(headers, matches[0]))
            result['Major Ingredients'] = clean_string(result['Major Ingredients'])

            # If any value is empty/too short, or leaked another header into
            # its text, replace it with the fixed apology string.
            for k in result:
                if len(result[k]) < 3 or any(header in result[k] for header in headers):
                    result[k] = "Sorry, can't explain this."

            gen_output[i] = result

        elif headers[1] in raw_text:
            # Fallback: no structured match, but the text mentions
            # 'Major Ingredients' — keep what follows it, stripped of the
            # '</s>' end token and filler dot-runs.
            explanation = raw_text.split(headers[1])[1].strip().replace('</s>', '')
            gen_output[i] = {"May contain misleading explanation":
                             dots_pattern.sub('', explanation)}
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}

        # 'Item Name' duplicates the menu entry itself; drop it if present.
        gen_output[i].pop('Item Name', None)
    return gen_output