import gradio as gr
from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy
import re
import os

# Load the small English spaCy pipeline (used for sentence segmentation).
nlp = spacy.load("en_core_web_sm")
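
# t5-small is reused below to "refine" the extracted entities into a short text string.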
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
_MODEL = {}
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
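
# Parse a raw .eml file into a mail-parser object exposing subject, from_, to, date and body.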
def accept_mail(file_path):
    email = parse_from_file(file_path)
    return email
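
# Strip the HTML body: remove <style>/<link> tags, then collapse all whitespace to single spaces.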
def clean_email(email):
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text
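
# Drop runs of '=', '_' and '-', e.g. quoted-printable soft breaks and ASCII divider lines.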
def remove_special_characters(text):
    pattern = r'[=_-]+'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text
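
# Segment the cleaned text into sentences so GLiNER receives sentence-sized inputs.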
def get_sentences(further_cleaned_text):
    doc = nlp(further_cleaned_text)
    sentences = [sent.text for sent in doc.sents]
    return sentences
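
# Lazily load and cache GLiNER models by name; the CACHE_DIR env var overrides the cache location.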
def get_model(model_name: str = None, multilingual: bool = False):
    if model_name is None:
        model_name = "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"
    global _MODEL
    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]
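
# Run zero-shot NER over each sentence and collect flat {text, label} records.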
def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
                nested_ner: bool = False, model_name: str = None,
                multilingual: bool = False) -> List[Dict[str, Union[str, list]]]:
    model = get_model(model_name, multilingual=multilingual)
    results = []
    for sentence in sentences:
        # Wire nested_ner to GLiNER's flat_ner flag; the original accepted the
        # parameter but never used it, so nested spans were always suppressed.
        _entities = model.predict_entities(sentence, labels, flat_ner=not nested_ner, threshold=threshold)
        entities = [{"text": entity["text"], "label": entity["label"]} for entity in _entities]
        results.extend(entities)
    return results
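
# Join the entities into "text as LABEL" pairs and hand them to t5-small under a free-form
# "refine entities:" prefix. t5-small was never fine-tuned on such an instruction, so treat
# the output as a best-effort paraphrase rather than a reliable normalization.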
def refine_entities_with_t5(entities):
    inputs = "refine entities: " + " ; ".join(f"{entity['text']} as {entity['label']}" for entity in entities)
    input_ids = t5_tokenizer.encode(inputs, return_tensors="pt", add_special_tokens=True)
    # Without an explicit budget, generate() stops at the 20-token default and truncates the output.
    outputs = t5_model.generate(input_ids, max_new_tokens=64)
    result = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result
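
# Gradio callback: .eml upload + selected labels -> header fields, an entity table, and the T5 string.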
def present(email_file, labels, multilingual=False):
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # model_name=None lets get_model() honor the multilingual checkbox; the original
    # hardcoded "urchade/gliner_base" here, which silently ignored that setting.
    entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False,
                           model_name=None, multilingual=multilingual)
    # Format entities for the Dataframe output: list of dicts -> list of [text, label] rows.
    entities_data = [[entity['text'], entity['label']] for entity in entities]
    refined_entities = refine_entities_with_t5(entities)
    return [email.subject, email.from_, email.to, email.date, entities_data, refined_entities]
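
# Entity labels offered in the UI. GLiNER is zero-shot, so this list can be edited freely.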
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.File(label="Upload Email (.eml file)"),
        gr.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # all labels selected by default
        ),
        gr.Checkbox(label="Use Multilingual Model"),
    ],
    outputs=[
        gr.Textbox(label="Subject"),
        gr.Textbox(label="From"),
        gr.Textbox(label="To"),
        gr.Textbox(label="Date"),
        gr.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
        gr.Textbox(label="Refined Entities"),
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities.",
)
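
# Quick local smoke test (illustrative; assumes a sample message saved as sample.eml):
#   python -c "from app import present; print(present('sample.eml', ['PERSON', 'DATE']))"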
demo.launch()  # share=True is not supported on Hugging Face Spaces and would only emit a warning