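"""Email Info: a Gradio Space that parses a raw email, cleans the HTML body,
splits it into sentences, and runs GLiNER zero-shot NER over each sentence."""
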
import gradio as gr
from mailparser import parse_from_string
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
import spacy
import re
import os
import en_core_web_sm
nlp = en_core_web_sm.load()
# nlp = spacy.load("en_core_web_sm")

_MODEL = {}  # cache of loaded GLiNER models, keyed by model name
_CACHE_DIR = os.environ.get("CACHE_DIR", None)  # optional model download cache dir

def accept_mail(email_content):
    """Parse a raw email string into a mailparser object."""
    email = parse_from_string(email_content)
    return email

def clean_email(email):
    """Strip <style>/<link> tags from the email body and collapse whitespace."""
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text

def remove_special_characters(text):
    """Remove runs of '=', '_', and '-' (divider lines and encoding leftovers)."""
    pattern = r'[=_-]+'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def get_sentences(further_cleaned_text):
    """Split the cleaned body into sentences with spaCy."""
    doc = nlp(further_cleaned_text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

# Legacy spaCy-based entity extraction, kept for reference; GLiNER (below) replaces it:
# doc = nlp(text)
# entities = []
# for ent in doc.ents:
#     if ent.label_ in labels:
#         entities.append((ent.text, ent.label_))
# return entities

def get_model(model_name: str = None):
    """Load a GLiNER model by name, reusing an already-loaded instance if present."""
    if model_name is None:
        model_name = "urchade/gliner_base"
    global _MODEL
    if _MODEL.get(model_name) is None:
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]

def parse_query(
    sentences: List[str],
    labels: Union[str, list],
    threshold: float = 0.3,
    nested_ner: bool = False,
    model_name: str = None,
) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER over each sentence; labels may be a list or a comma-separated string."""
    model = get_model(model_name)
    if isinstance(labels, str):
        labels = [i.strip() for i in labels.split(",")]
    results = []
    for sentence in sentences:
        # Forward nested_ner via GLiNER's flat_ner switch (previously this
        # argument was accepted but ignored).
        entities = model.predict_entities(
            sentence, labels, flat_ner=not nested_ner, threshold=threshold
        )
        results.append({"sentence": sentence, "entities": entities})
    return results
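
# Illustrative output shape (hypothetical values; the exact entity keys, e.g.
# "score"/"start"/"end", depend on the installed gliner version):
#   parse_query(["Alice paid by credit card"], "PERSON, ORDER PAYMENT METHOD")
#   -> [{"sentence": "Alice paid by credit card",
#        "entities": [{"text": "Alice", "label": "PERSON", ...},
#                     {"text": "credit card", "label": "ORDER PAYMENT METHOD", ...}]}]
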
def present(email_content, labels):
    """End-to-end pipeline: parse the email, clean the body, split into sentences,
    and extract entities; returns one value per output component."""
    email = accept_mail(email_content)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    result = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False,
                         model_name="urchade/gliner_base")
    email_info = {
        "Subject": email.subject,
        "From": email.from_,
        "To": email.to,
        "Date": email.date,
        "Cleaned Body": further_cleaned_text,
        "Extracted Entities": result,
    }
    return [email_info[key] for key in email_info]

labels = ["PERSON", "PRODUCT", "DEAL", "ORDER",
"ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY",
"MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.Textbox(label="Email Content"),
        # `value=` preselects all labels (`default=` is not a valid kwarg in
        # current Gradio releases).
        gr.components.CheckboxGroup(label="Labels to Detect", choices=labels, value=labels),
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Textbox(label="Cleaned Body"),
        gr.components.Textbox(label="Extracted Entities"),
    ],
    title="Email Info",
    description="Enter the email content below to view its details and detected entities.",
)

demo.launch()
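
# To try the app, paste a raw RFC 5322 message into "Email Content".
# Hypothetical sample input:
#   From: store@example.com
#   To: alice@example.com
#   Subject: Your order has shipped
#   Date: Mon, 01 Jan 2024 10:00:00 +0000
#
#   Hi Alice, your order #12345 paid by credit card shipped from our downtown store.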