en_motorinc_nlp-0.0.2 / custom_spacy_normalizer.py
karthik-raju's picture
Update spaCy pipeline
2f1c8b2 verified
from spacy.language import Language
from spacy.tokens import Span
import re
# Define the price normalization component globally
@Language.component("normalize_prices")
def normalize_prices(doc):
if not Span.has_extension("normalized_value"):
Span.set_extension("normalized_value", default=None)
for ent in doc.ents:
if ent.label_ == "PRICE":
try:
text = " ".join(ent.text.lower().split())
number = float(re.findall(r"\d+(?:\.\d+)?", text)[0])
if any(suffix in text for suffix in ["cr", "crore", "crores"]):
normalized = number * 10000000
elif any(
suffix in text for suffix in ["l", "lakh", "lakhs", "lac", "lacs"]
):
normalized = number * 100000
elif any(suffix in text for suffix in ["k"]):
normalized = number * 1000
else:
normalized = number
ent._.normalized_value = normalized
except Exception as e:
print(f"Error normalizing price {ent.text}: {str(e)}")
return doc
@Language.component("normalize_engine_displacement")
def normalize_engine_displacement(doc):
if not Span.has_extension("normalized_value"):
Span.set_extension("normalized_value", default=None)
for ent in doc.ents:
if ent.label_ == "EngineDisplacement":
try:
text = " ".join(ent.text.lower().split())
number = float(re.findall(r"\d+(?:\.\d+)?", text)[0])
normalized = None
if any(suffix in text for suffix in ["l", "L", "liter", "litre", "cc"]):
normalized = number
if normalized is None:
return
ent._.normalized_value = normalized
except Exception as e:
print(f"""Error normalizing engine displacement {
ent.text}: {str(e)}""")
return doc