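"""Email Info Extractor.

A Gradio app that parses an uploaded .eml file with mailparser, strips
HTML from the body, splits the cleaned text into sentences with spaCy,
and runs GLiNER zero-shot entity recognition over each sentence.
"""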
import gradio as gr
from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, List, Union
import spacy
import re
import os

# spaCy pipeline, used only for sentence segmentation.
nlp = spacy.load("en_core_web_sm")

# Cache of loaded GLiNER models, keyed by model name.
_MODEL = {}
_CACHE_DIR = os.environ.get("CACHE_DIR")

def accept_mail(file_path):
    email = parse_from_file(file_path)
    return email

def clean_email(email):
    # Drop <style> and <link> tags, then collapse all whitespace runs.
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text
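# Illustrative example (not from the original source): for a body of
# '<style>p{color:red}</style><p>Hi  there</p>', clean_email returns
# 'Hi there' (style/link tags removed, whitespace collapsed).
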
def remove_special_characters(text):
    # Delete runs of '=', '_' and '-' (common quoted-printable and
    # ASCII-art separators in email bodies).
    pattern = r'[=_-]+'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text
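# Illustrative example (not from the original source): separator runs
# are removed outright, so
#   remove_special_characters("=== Order --- Summary ===") -> " Order  Summary "
# Note that intra-word hyphens disappear too: "e-mail" -> "email".
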
def get_sentences(further_cleaned_text):
    doc = nlp(further_cleaned_text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

def get_model(model_name: str = None, multilingual: bool = False):
    # Load each GLiNER model at most once and reuse it across requests.
    if model_name is None:
        model_name = "urchade/gliner_multilingual" if multilingual else "urchade/gliner_base"
    global _MODEL
    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]
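# Because of the cache above, repeated calls are cheap: calling
# get_model("urchade/gliner_base") twice returns the same instance,
# with the second call reduced to a dict lookup instead of a download.
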
def parse_query(
    sentences: List[str],
    labels: List[str],
    threshold: float = 0.3,
    nested_ner: bool = False,
    model_name: str = None,
    multilingual: bool = False,
) -> List[Dict[str, Union[str, list]]]:
    model = get_model(model_name, multilingual=multilingual)
    results = []
    for sentence in sentences:
        # flat_ner=False lets GLiNER return nested (overlapping) spans.
        _entities = model.predict_entities(sentence, labels, threshold=threshold, flat_ner=not nested_ner)
        entities = [{"text": entity["text"], "label": entity["label"]} for entity in _entities]
        results.extend(entities)
    return results
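# Sketch of the output shape (the spans below are made up; actual
# predictions depend on the model and threshold):
#   parse_query(["John paid the order by card."], ["PERSON", "ORDER PAYMENT METHOD"])
#   -> [{"text": "John", "label": "PERSON"},
#       {"text": "card", "label": "ORDER PAYMENT METHOD"}]
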
def present(email_file, labels, multilingual=False):
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # model_name is left unset so get_model picks the multilingual
    # variant when the checkbox is ticked.
    entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, multilingual=multilingual)
    # Rows for the Dataframe output: one (text, label) pair per entity.
    entity_rows = [[e["text"], e["label"]] for e in entities]
    return [email.subject, email.from_, email.to, email.date, entity_rows]

labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.File(label="Upload Email (.eml file)"),
        gr.components.CheckboxGroup(
            choices=labels,
            label="Labels to Detect",
            value=labels,  # all labels selected by default
        ),
        gr.components.Checkbox(label="Use Multilingual Model"),
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Dataframe(label="Extracted Entities", headers=["Text", "Label"]),
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities.",
)
demo.launch(share=True)
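# Run this file directly with Python to launch the app locally.
# share=True asks Gradio to create a temporary public link alongside the
# local server; when hosted as a Hugging Face Space, the Space itself
# already provides the public URL.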