File size: 3,991 Bytes
9a0f501
 
2a12b77
9a0f501
 
 
 
 
 
 
 
 
 
11b899a
 
 
 
 
 
 
 
 
 
 
 
9a0f501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11b899a
 
 
 
 
 
 
 
 
 
 
9a0f501
 
11b899a
2a12b77
 
 
 
 
 
 
 
 
 
9a0f501
 
 
 
2a12b77
 
9a0f501
2a12b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

import numpy as np
from typing import List, Tuple, Optional, AnyStr, Dict
import nltk
# NOTE(review): these downloads run as a module-import side effect and hit the
# network on first run (cached afterwards); consider moving them to an
# explicit setup step so importing this module stays cheap and offline-safe.
nltk.download("stopwords")
nltk.download('punkt')

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re


def preprocess_text(sentence: str) -> str:
    """
    Pre-process raw text by removing special characters, hyperlinks,
    numbers, and English stop words.

    Steps, in order:
      1. lowercase and drop the literal '{html}' marker
      2. strip HTML tags (``<...>``)
      3. remove ``http...`` URL fragments
      4. remove digit runs
      5. tokenize on word characters; keep tokens longer than 2 chars
         that are not English stop words

    Parameters:
        sentence: str, required -> raw string which may contain stop words,
            special characters, URLs, numbers, etc.

    Returns:
        str -> cleaned, single-space-joined tokens.
    """
    lowered = sentence.lower().replace('{html}', "")

    # Strip HTML tags, then URLs, then digits.
    no_html = re.sub(r'<.*?>', '', lowered)
    no_urls = re.sub(r'http\S+', '', no_html)
    no_nums = re.sub(r'[0-9]+', '', no_urls)

    tokens = RegexpTokenizer(r'\w+').tokenize(no_nums)

    # Build the stop-word set ONCE: the original called
    # stopwords.words('english') inside the comprehension, re-reading the
    # corpus list for every token (O(tokens * stopwords)).
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]

    return " ".join(filtered_words)

def image_to_np_arr(image) -> np.ndarray:
    """
    Convert an image-like object into a numpy array.

    Parameters:
        image: any object accepted by ``np.array`` (e.g. a PIL image or a
            nested sequence). No dtype conversion is performed — the dtype
            is whatever ``np.array`` infers from the input.
            (The previous docstring documented unrelated parameters and a
            float conversion the code never did.)

    Returns:
        np.ndarray
    """
    return np.array(image)

async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """
    Clean extracted-text tuples and keep only multi-word results.

    Parameters:
        raw_extrc_text: List[Tuple], required -> 3-tuples where the middle
            element is the extracted text (presumably bbox/text/confidence —
            only the middle element is used here).

    Returns:
        List[AnyStr] -> cleaned strings containing at least two words.
    """
    # Clean every extracted string, then keep those with >= 2 words.
    cleaned = [preprocess_text(text) for _, text, _ in raw_extrc_text]
    return [txt for txt in cleaned if len(txt.split(" ")) >= 2]

def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern: re.Pattern) -> List[Dict]:
    """
    Convert raw generated strings into structured dicts, in place.

    Parameters:
        gen_output: List[str], required -> raw model outputs. Each entry is
            replaced by a dict (the list is mutated AND returned).
        header_pattern: str, required -> regex producing one capture group
            per header below, in order, for a well-formed output.
        dots_pattern: re.Pattern, required -> compiled regex used to scrub
            filler dot-runs from fallback explanations. NOTE: previously
            annotated ``str``, but ``.sub()`` is called on it, so a compiled
            pattern is required.

    Returns:
        List[Dict] -> the same list; each element is a dict keyed by the
        section headers, with 'Item Name' removed.
    """
    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]

    def clean_string(input_string):
        # Normalize a comma-separated list: trim each part, drop empties.
        parts = input_string.split(',')
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ', '.join(cleaned_parts)

    for i, raw_text in enumerate(gen_output):
        matches = re.findall(header_pattern, raw_text)

        if matches:
            # re.findall returns a list of group-tuples; zip the first
            # match's groups against the headers.
            result = dict(zip(headers, matches[0]))
            result['Major Ingredients'] = clean_string(result['Major Ingredients'])

            # If any value is empty/too short, or leaked another header into
            # its text, replace it with the fixed apology string.
            for k in result:
                if len(result[k]) < 3 or any(header in result[k] for header in headers):
                    result[k] = "Sorry, can't explain this."

            gen_output[i] = result

        elif headers[1] in raw_text:
            # Fallback: no structured match, but the text mentions
            # 'Major Ingredients' — keep what follows it, stripped of the
            # '</s>' end token and filler dot-runs.
            explanation = raw_text.split(headers[1])[1].strip().replace('</s>', '')
            gen_output[i] = {"May contain misleading explanation":
                             dots_pattern.sub('', explanation)}
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}

        # 'Item Name' duplicates the menu entry itself; drop it if present.
        gen_output[i].pop('Item Name', None)
    return gen_output