import re
from typing import Dict, List, Tuple

import numpy as np
import nltk

# Corpora must be downloaded before stopwords.words() is first called.
nltk.download("stopwords")
nltk.download("punkt")

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Compiled once at module level; the original recompiled these on every
# preprocess_text() call.
_HTML_TAG_RE = re.compile(r"<.*?>")
_URL_RE = re.compile(r"http\S+")
_NUM_RE = re.compile(r"[0-9]+")
_WORD_TOKENIZER = RegexpTokenizer(r"\w+")


def preprocess_text(sentence: str) -> str:
    """Normalize raw text for downstream processing.

    Lower-cases the input, strips literal '{html}' markers, HTML tags,
    URLs and digits, tokenizes on word characters, then drops English
    stop words and tokens shorter than 3 characters.

    Parameters:
        sentence: str, required -> A raw string which may contain stop
            words, special characters, links, etc.

    Returns:
        return_txt: str -> A clean, space-joined string with all of the
            aforementioned removed.
    """
    text = sentence.lower().replace("{html}", "")
    text = _HTML_TAG_RE.sub("", text)
    text = _URL_RE.sub("", text)
    text = _NUM_RE.sub("", text)
    tokens = _WORD_TOKENIZER.tokenize(text)
    # Build the stop-word set once per call: the original rebuilt the
    # stop-word *list* and scanned it linearly for every single token.
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return_txt = " ".join(filtered_words)
    return return_txt


def image_to_np_arr(image) -> np.ndarray:
    """Convert an image object into a numpy array.

    Parameters:
        image: required -> Any object np.array() accepts (e.g. a PIL
            image, byte buffer, or nested sequence).

    Returns:
        np.ndarray -> Array holding the image data.
    """
    return np.array(image)


async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[str]:
    """Clean extracted text fragments and keep only multi-word results.

    Each tuple's middle element is cleaned via preprocess_text();
    cleaned strings with fewer than two words are discarded.

    Parameters:
        raw_extrc_text: List[Tuple], required -> Tuples of the form
            (_, extracted_text, _); only the middle element is used.

    Returns:
        List[str] -> Processed strings containing at least two words.
    """
    output_texts = []
    for _, extr_text, _ in raw_extrc_text:
        # Remove all numbers and special characters from the string.
        prcsd_txt = preprocess_text(extr_text)
        if len(prcsd_txt.split(" ")) >= 2:
            output_texts.append(prcsd_txt)
    return output_texts


def post_process_gen_outputs(gen_output: List[str],
                             header_pattern: str,
                             dots_pattern: re.Pattern) -> List[Dict]:
    """Parse generated menu descriptions into structured dictionaries.

    Each string in gen_output is matched against header_pattern, whose
    capture groups must line up one-to-one with the section headers
    below. Strings that do not match fall back to a best-effort or
    apology dictionary. NOTE: gen_output is mutated in place (strings
    are replaced by dicts) and the same list object is returned.

    Parameters:
        gen_output: List[str], required -> Raw generated descriptions.
        header_pattern: str, required -> Regex with one capture group
            per entry in `headers`.
        dots_pattern: re.Pattern, required -> Compiled pattern of filler
            dots to strip from salvaged explanations. (It was annotated
            `str` before, but `.sub()` is called on it, so a compiled
            pattern is what callers must actually pass.)

    Returns:
        List[Dict] -> One dictionary per input string.
    """
    # Section names, in the order of the pattern's capture groups.
    headers = ["Item Name", "Major Ingredients", "Making Process",
               "Portion and Spice Level", "Pairs With", "Allergens",
               "Food Type"]

    def clean_string(input_string: str) -> str:
        # Drop empty / whitespace-only comma-separated parts.
        parts = input_string.split(',')
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ', '.join(cleaned_parts)

    for i, raw in enumerate(gen_output):
        matches = re.findall(header_pattern, raw)
        if matches:
            # re.findall returns a list of group-tuples; use the first.
            result = dict(zip(headers, matches[0]))
            result['Major Ingredients'] = clean_string(result['Major Ingredients'])
            # If any value is (near-)empty or merely echoes a header,
            # replace it with an apology.
            for k in result:
                if len(result[k]) < 3 or any(h in result[k] for h in headers):
                    result[k] = "Sorry, can't explain this."
            gen_output[i] = result
        elif headers[1] in raw:
            # No full match, but the ingredients header is present:
            # salvage whatever text follows it. (The original also
            # chained .replace('', ''), a no-op, dropped here.)
            salvage = raw.split(headers[1])[1].strip()
            gen_output[i] = {
                "May contain misleading explanation": dots_pattern.sub('', salvage)
            }
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}
        # Item names come from the menu itself, so never expose them.
        gen_output[i].pop('Item Name', None)
    return gen_output