import os
import re
from typing import Dict, List, Union

import gradio as gr
import spacy
from bs4 import BeautifulSoup
from gliner import GLiNER
from mailparser import parse_from_file

# spaCy English pipeline, used only for sentence segmentation.
nlp = spacy.load("en_core_web_sm")
# Per-process cache of loaded GLiNER models, keyed by model name.
_MODEL = {}
# Optional download directory for model weights, taken from the environment.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
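# For example (assuming this script is saved as app.py), a persistent cache
# directory can be supplied at launch:
#   CACHE_DIR=/data/models python app.py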
def accept_mail(file_path):
    """Parse an .eml file from disk into a mailparser object."""
    return parse_from_file(file_path)
def clean_email(email):
    """Strip HTML markup from the email body and collapse whitespace."""
    soup = BeautifulSoup(email.body, 'html.parser')
    # Drop style/link tags outright; their contents are markup, not body text.
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    return ' '.join(soup.get_text(separator=' ').split())
def remove_special_characters(text):
    """Remove runs of '=', '_' and '-', which commonly survive quoted-printable
    encoding and ASCII divider lines in plain-text emails."""
    return re.sub(r'[=_-]+', '', text)
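# For example:
#   remove_special_characters("Order confirmed =-=-= thank you")
#   -> "Order confirmed  thank you"
# Note that this also strips hyphens inside words ("e-mail" -> "email").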
def get_sentences(further_cleaned_text):
    """Segment the cleaned body into sentences with spaCy."""
    doc = nlp(further_cleaned_text)
    return [sent.text for sent in doc.sents]
def get_model(model_name: str = None, multilingual: bool = False):
    """Load a GLiNER model once per process and reuse it afterwards."""
    if model_name is None:
        model_name = "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"
    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]
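# Optional warm-up sketch, so the first request does not pay the download and
# load cost (both names are the defaults resolved above):
#   get_model()                   # urchade/gliner_base
#   get_model(multilingual=True)  # urchade/gliner_multilingual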
def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
                nested_ner: bool = False, model_name: str = None,
                multilingual: bool = False) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER over each sentence and collect the detected entities."""
    model = get_model(model_name, multilingual=multilingual)
    results = []
    for sentence in sentences:
        # GLiNER's flat_ner flag disables nested decoding; invert it so that
        # nested_ner=True allows overlapping entity spans.
        entities = model.predict_entities(sentence, labels,
                                          flat_ner=not nested_ner,
                                          threshold=threshold)
        results.append({"sentence": sentence, "entities": entities})
    return results
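# Illustrative result shape (values invented; field names follow the dicts
# GLiNER returns from predict_entities):
#   [{"sentence": "Your order 1234 ships Friday.",
#     "entities": [{"start": 11, "end": 15, "text": "1234",
#                   "label": "ORDER", "score": 0.62}]}]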
def present(email_file, labels, multilingual=False):
    """Gradio handler: parse, clean, segment, and tag an uploaded email."""
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # model_name stays None so get_model can honor the multilingual toggle;
    # hard-coding "urchade/gliner_base" here would silently ignore it.
    result = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False,
                         model_name=None, multilingual=multilingual)
    email_info = {
        "Subject": email.subject,
        "From": email.from_,
        "To": email.to,
        "Date": email.date,
        "Cleaned Body": further_cleaned_text,
        "Extracted Entities": result,
    }
    # Dicts preserve insertion order, so this matches the output order below.
    return [email_info[key] for key in email_info]
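# Quick check without the UI (hypothetical file name):
#   print(present("sample.eml", ["PERSON", "ORDER", "DATE"]))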
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
demo = gr.Interface(
    fn=present,
    inputs=[
        # type="filepath" hands the handler a path string, which
        # mailparser's parse_from_file expects.
        gr.File(label="Upload Email (.eml file)", type="filepath"),
        gr.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # all labels selected by default
        ),
        gr.Checkbox(label="Use Multilingual Model"),
    ],
    outputs=[
        gr.Textbox(label="Subject"),
        gr.Textbox(label="From"),
        gr.Textbox(label="To"),
        gr.Textbox(label="Date"),
        gr.Textbox(label="Cleaned Body"),
        gr.JSON(label="Extracted Entities"),
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities.",
)
demo.launch(share=True)
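# Note: share=True creates a temporary public gradio.live link when the app is
# run locally; on Hugging Face Spaces it is ignored, since the Space already
# serves the app publicly.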