Spaces:
Sleeping
Sleeping
File size: 2,764 Bytes
8efe659 5777a9a 86fc40d 2fe2a42 13ac7c2 86fc40d 9de97c6 86fc40d 2fe2a42 3fd92e9 13ac7c2 5777a9a 9de97c6 5777a9a f9779a0 9de97c6 86fc40d 9de97c6 28ca0f2 1efe83d 5777a9a f9779a0 9de97c6 13ac7c2 56c79b1 9de97c6 f9779a0 56c79b1 1efe83d f9779a0 9de97c6 3fd92e9 9de97c6 3fd92e9 9de97c6 f9779a0 3fd92e9 9de97c6 3fd92e9 56c79b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
import re
import os
from mailparser import parse_from_string
from bs4 import BeautifulSoup
from gliner import GLiNER
import spacy
# spaCy pipeline used only for sentence segmentation in get_sentences();
# the "en_core_web_sm" model must already be installed in the environment.
nlp = spacy.load("en_core_web_sm")
# Process-wide cache mapping model name -> loaded GLiNER instance (filled lazily by get_model).
_MODEL = {}
# Optional override for the Hugging Face download cache; None falls back to the library default.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(email_content):
    """Parse raw email source text into a mailparser message object.

    Args:
        email_content: Full raw email (headers + body) as a string.

    Returns:
        The parsed mail object exposing ``subject``, ``from_``, ``to``,
        ``date`` and ``body`` attributes.
    """
    return parse_from_string(email_content)
def clean_email(email):
    """Return the email body as plain text with markup noise removed.

    Strips ``<style>`` and ``<link>`` elements from the (possibly HTML)
    body, extracts the remaining text, and collapses all runs of
    whitespace into single spaces.
    """
    soup = BeautifulSoup(email.body, 'html.parser')
    # Drop non-content elements entirely rather than just their tags.
    for unwanted in soup(['style', 'link']):
        unwanted.decompose()
    text = soup.get_text(separator=' ')
    return ' '.join(text.split())
# Characters commonly left over from quoted-printable / signature separators.
_SEPARATOR_RUN = re.compile(r'[=_-]+')

def remove_special_characters(text):
    """Delete every run of '=', '_' or '-' characters from *text*.

    Note: the matched characters are removed outright (not replaced with
    a space), so tokens joined by them are merged together.
    """
    return _SEPARATOR_RUN.sub('', text)
def get_sentences(further_cleaned_text):
    """Segment *further_cleaned_text* into sentences via the module-level
    spaCy pipeline and return them as a list of strings."""
    return [sentence.text for sentence in nlp(further_cleaned_text).sents]
def get_model(model_name: str = None):
    """Return a GLiNER model, loading and caching it on first use.

    Args:
        model_name: Hugging Face model id; defaults to "urchade/gliner_base"
            when None.

    Returns:
        The cached GLiNER instance for *model_name*.
    """
    name = model_name if model_name is not None else "urchade/gliner_base"
    # Load at most once per process; _MODEL is mutated in place, so no
    # `global` declaration is needed.
    if name not in _MODEL:
        _MODEL[name] = GLiNER.from_pretrained(name, cache_dir=_CACHE_DIR)
    return _MODEL[name]
def present(email_content, labels):
    """Parse, clean and summarize an email for the Gradio interface.

    Args:
        email_content: Raw email source text.
        labels: Entity labels selected in the UI. Currently unused —
            entity extraction is disabled (commented out upstream) but
            the parameter is kept so the Gradio wiring stays intact.

    Returns:
        A list of five values in fixed order: subject, sender, recipients,
        date, and the cleaned body text.
    """
    parsed = accept_mail(email_content)
    body = remove_special_characters(clean_email(parsed))
    # Sentence segmentation result is not surfaced yet; computed to match
    # the original pipeline (and ready for future entity extraction).
    get_sentences(body)
    details = {
        "Subject": parsed.subject,
        "From": parsed.from_,
        "To": parsed.to,
        "Date": parsed.date,
        "Cleaned Body": body,
    }
    return list(details.values())
# Entity labels offered in the UI (pre-selected by default).
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER",
          "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY",
          "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]

demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.Textbox(label="Email Content"),
        # Gradio 3.x renamed the initial-value kwarg from `default` to
        # `value`; passing `default` raises TypeError on modern Gradio.
        gr.components.CheckboxGroup(label="Labels to Detect", choices=labels, value=labels)
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Textbox(label="Cleaned Body"),
    ],
    title="Email Info",
    description="Enter the email content below to view its details and detected entities."
)
demo.launch()
|