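"""Gradio app: upload an .eml email, extract entities with GLiNER,
and run the detected entities through a small T5 post-processing pass."""
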
import gradio as gr

from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
from transformers import T5Tokenizer, T5ForConditionalGeneration

import spacy
import re
import os

# spaCy English pipeline, used only for sentence segmentation
# (requires `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# t5-small provides a lightweight text-to-text pass over the extracted
# entities (see refine_entities_with_t5 below).
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Lazily loaded GLiNER models, keyed by checkpoint name, so repeated
# requests reuse the weights instead of reloading them.
_MODEL = {}
_CACHE_DIR = os.environ.get("CACHE_DIR", None)

def accept_mail(file_path):
    # Depending on the Gradio version, the File component passes either a
    # filepath string or a tempfile-like object with a .name attribute.
    path = file_path.name if hasattr(file_path, "name") else file_path
    return parse_from_file(path)

def clean_email(email):
    # Strip markup (including style/script blocks, whose contents would
    # otherwise leak into get_text) and collapse whitespace.
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'script', 'link']):
        tag.decompose()
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text

def remove_special_characters(text):
    # Drop runs of '=', '_' and '-' left over from quoted-printable
    # encoding and ASCII divider lines.
    pattern = r'[=_-]+'
    return re.sub(pattern, '', text)

def get_sentences(further_cleaned_text):
    doc = nlp(further_cleaned_text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_model(model_name: str = None, multilingual: bool = False):
    # Resolve the default checkpoint, then load once and cache in _MODEL.
    if model_name is None:
        model_name = "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"

    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)

    return _MODEL[model_name]

def parse_query(
    sentences: List[str],
    labels: List[str],
    threshold: float = 0.3,
    nested_ner: bool = False,  # accepted for API symmetry; not forwarded to GLiNER here
    model_name: str = None,
    multilingual: bool = False,
) -> List[Dict[str, Union[str, list]]]:
    model = get_model(model_name, multilingual=multilingual)

    results = []
    for sentence in sentences:
        _entities = model.predict_entities(sentence, labels, threshold=threshold)
        results.extend({"text": e["text"], "label": e["label"]} for e in _entities)

    return results

def refine_entities_with_t5(entities):
    # Serialize the entities into a single prompt and let T5 rewrite it.
    # Note: t5-small has no dedicated "refine entities" task, so this is
    # best-effort text-to-text generation rather than true refinement.
    if not entities:
        return ""
    inputs = "refine entities: " + " ; ".join(f"{e['text']} as {e['label']}" for e in entities)
    input_ids = t5_tokenizer.encode(inputs, return_tensors="pt", add_special_tokens=True)
    # Raise the generation cap: the default (20 tokens) truncates longer entity lists.
    outputs = t5_model.generate(input_ids, max_new_tokens=64)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

def present(email_file, labels, multilingual=False):
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)

    # Let get_model resolve the checkpoint: hard-coding model_name here
    # would silently override the multilingual checkbox.
    entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False,
                           model_name=None, multilingual=multilingual)

    # Format entities for the DataFrame: list of dicts -> list of [text, label] rows.
    entities_data = [[entity['text'], entity['label']] for entity in entities]

    refined_entities = refine_entities_with_t5(entities)

    # Order matches the Interface outputs: Subject, From, To, Date, entities, refined.
    return [email.subject, email.from_, email.to, email.date, entities_data, refined_entities]

labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]

demo = gr.Interface(
    fn=present, 
    inputs=[
        gr.components.File(label="Upload Email (.eml file)"),
        gr.components.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # Default all selected
        ),
        gr.components.Checkbox(label="Use Multilingual Model")
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
        gr.components.Textbox(label="Refined Entities")
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities."
)
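# share=True exposes a temporary public link; remove it for local-only use.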
demo.launch(share=True)