# Page-header residue captured with this file (not program code):
#   SalmanML's picture
#   Update app.py
#   351c297 verified
# from flair.data import Sentence
# from flair.models import SequenceTagger
# import streamlit as st
# # load tagger
# tagger = SequenceTagger.load("flair/ner-english-large")
# # make example sentence
# text=st.text_area("Enter the text to detect it's named entities")
# sentence = Sentence(text)
# # predict NER tags
# tagger.predict(sentence)
# # print sentence
# print(sentence)
# # print predicted NER spans
# print('The following NER tags are found:')
# # iterate over entities and printx
# for entity in sentence.get_spans('ner'):
# print(entity)
# import easyocr
# import cv2
# import requests
# import re
# from PIL import Image
# import streamlit as st
# # import os
# # Load the EasyOCR reader
# reader = easyocr.Reader(['en'])
# # key=os.environ.getattribute("api_key")
# # print(key)
# API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# headers = {"Authorization": st.secrets["api_key"]}
# ## Image uploading function ##
# def image_upload_and_ocr(reader):
# uploaded_file=st.file_uploader(label=':red[**please upload a busines card** :sunglasses:]',type=['jpeg','jpg','png','webp'])
# if uploaded_file is not None:
# image=Image.open(uploaded_file)
# image=image.resize((640,480))
# result2 = reader.readtext(image)
# # result2=result
# texts = [item[1] for item in result2]
# result=' '.join(texts)
# return result2,result
# def query(payload):
# response = requests.post(API_URL, headers=headers, json=payload)
# return response.json()
# def get_ner_from_transformer(output):
# data = output
# named_entities = {}
# for entity in data:
# entity_type = entity['entity_group']
# entity_text = entity['word']
# if entity_type not in named_entities:
# named_entities[entity_type] = []
# named_entities[entity_type].append(entity_text)
# # for entity_type, entities in named_entities.items():
# # print(f"{entity_type}: {', '.join(entities)}")
# return entity_type,named_entities
# ### DRAWING DETECTION FUNCTION ###
# def drawing_detection(image):
# # Draw bounding boxes around the detected text regions
# for detection in image:
# # Extract the bounding box coordinates
# points = detection[0] # List of points defining the bounding box
# x1, y1 = int(points[0][0]), int(points[0][1]) # Top-left corner
# x2, y2 = int(points[2][0]), int(points[2][1]) # Bottom-right corner
# # Draw the bounding box
# cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
# # Add the detected text
# text = detection[1]
# cv2.putText(image, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
# st.image(image,caption='Detected text on the card ',width=710)
# return image
# st.title("_Business_ card data extractor using opencv and streamlit :sunglasses:")
# res2,res=image_upload_and_ocr(reader)
# darwing_image=drawing_detection(res2)
# output = query({
# "inputs": res,
# })
# entity_type,named_entities= get_ner_from_transformer(output)
# st.write(entity_type)
# st.write(named_entities)
import easyocr
import cv2
import requests
import re
from PIL import Image
import streamlit as st
import numpy as np
# Load the EasyOCR reader
# The reader is created once per script execution, configured for English only.
# NOTE(review): Streamlit re-executes the script on user interaction, so this
# presumably rebuilds the reader on every rerun - consider caching it; confirm.
reader = easyocr.Reader(['en'])
# Endpoint used by query() for the flair/ner-english-large NER model.
API_URL = "https://api-inference.huggingface.co/models/flair/ner-english-large"
# The Authorization header value is the Streamlit secret "api_key", used verbatim.
# NOTE(review): presumably the secret already carries the "Bearer " prefix - confirm.
headers = {"Authorization": st.secrets["api_key"]}
## Image uploading function ##
def image_upload_and_ocr(reader, uploaded_file):
    """Run OCR on an uploaded image.

    Args:
        reader: Object exposing ``readtext(array)``; the module passes the
            EasyOCR reader. Each returned detection is indexed as
            ``detection[1]`` to obtain its text.
        uploaded_file: File-like object accepted by ``PIL.Image.open``, or
            ``None`` when nothing has been uploaded.

    Returns:
        A ``(detections, text, image)`` tuple where ``detections`` is the raw
        ``readtext`` result, ``text`` is every detected string joined with
        single spaces, and ``image`` is the resized RGB PIL image that was
        recognised. Returns ``(None, None, None)`` when ``uploaded_file`` is
        ``None``.
    """
    if uploaded_file is None:
        return None, None, None

    # Normalise to 3-channel RGB: palette ("P"), grayscale ("L") and RGBA
    # uploads would otherwise become 2-D or 4-channel arrays, and downstream
    # code converts the image with an RGB->BGR colour conversion.
    image = Image.open(uploaded_file).convert("RGB")
    # Fixed working size; note this does not preserve the aspect ratio.
    image = image.resize((640, 480))
    image_np = np.array(image)  # readtext is given a NumPy array, not a PIL image

    result2 = reader.readtext(image_np)
    texts = [item[1] for item in result2]
    result = ' '.join(texts)
    return result2, result, image
def query(payload, timeout=30):
    """POST *payload* as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            app if the endpoint stalls.

    Returns:
        Whatever JSON document the endpoint returned (no status check is
        performed here, so this may be an error object rather than results).

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the timeout expires.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
def get_ner_from_transformer(output):
    """Group NER results by entity type.

    Args:
        output: Iterable of mappings, each with an ``'entity_group'`` key
            (the entity type) and a ``'word'`` key (the entity text).

    Returns:
        A ``(last_entity_type, named_entities)`` tuple. ``named_entities``
        maps each entity type to the list of its texts in input order.
        ``last_entity_type`` is the type of the final entity processed, or
        ``None`` when ``output`` is empty.

    Raises:
        TypeError: if ``output`` is a mapping instead of a list of entities
            (for example an error object returned by the API).
    """
    if isinstance(output, dict):
        # A mapping here means the service answered with something other than
        # an entity list; surface its content instead of failing later with
        # an unrelated "string indices" error.
        raise TypeError(f"Expected a list of entities, got: {output!r}")

    # Pre-bind so an empty result does not leave the name undefined at return.
    entity_type = None
    named_entities = {}
    for entity in output:
        entity_type = entity['entity_group']
        named_entities.setdefault(entity_type, []).append(entity['word'])
    return entity_type, named_entities
def drawing_detection(res2, image):
    """Draw OCR detections on the card image and display it in Streamlit.

    Args:
        res2: Iterable of detections. ``detection[0]`` is indexed as a list of
            corner points (index 0 is used as top-left, index 2 as
            bottom-right) and ``detection[1]`` as the recognised text.
        image: RGB image convertible with ``np.array`` (the PIL image that
            was recognised).

    Returns:
        The annotated image as a BGR NumPy array.
    """
    cv2_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    for detection in res2:
        points = detection[0]
        x1, y1 = int(points[0][0]), int(points[0][1])  # Top-left corner
        x2, y2 = int(points[2][0]), int(points[2][1])  # Bottom-right corner

        # Colours are BGR tuples: blue outline, red label.
        cv2.rectangle(cv2_image, (x1, y1), (x2, y2), (255, 0, 0), 1)

        text = detection[1]
        # Keep the label baseline inside the canvas for boxes that touch the
        # top edge; otherwise the text would be drawn at a negative y.
        label_y = max(y1 - 10, 12)
        cv2.putText(cv2_image, text, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # The array is BGR, so say so; st.image otherwise interprets it as RGB
    # and shows the card with red and blue swapped.
    st.image(cv2_image, caption='Detected text on the card', channels="BGR", width=710)
    return cv2_image
# Function to extract phone numbers from text using regular expression
def extract_phone_numbers(text):
    """Find phone-number-like runs of digits in *text*.

    A match is an optional ``ph`` / ``phone`` / ``phno`` prefix followed by at
    least seven digits, where each digit may be preceded by ``+`` or ``-``
    and followed by whitespace or parentheses. The prefix, when present, is
    part of the returned string.

    Args:
        text: Text to search (case-insensitive).

    Returns:
        List of matches with surrounding whitespace removed, or ``None`` when
        nothing matched.
    """
    PHONE_PATTERN = r'(?:ph|phone|phno)?\s*(?:[+-]?\d\s*[\(\)]*){7,}'
    raw_matches = re.findall(PHONE_PATTERN, text, re.IGNORECASE)
    # The pattern absorbs whitespace before the first and after the last
    # digit; trim it so callers get clean values, as extract_email does.
    phone_numbers = [match.strip() for match in raw_matches]
    return phone_numbers or None
# Function to extract email addresses from text using regular expression
def extract_email(text):
    """Find e-mail addresses in *text*, tolerating OCR spacing artefacts.

    The pattern allows whitespace around ``@`` and around the dot before the
    final 2-3 letter suffix, and the dot itself is optional.

    Args:
        text: Text to search (case-insensitive).

    Returns:
        List of matched addresses with surrounding whitespace removed, or
        ``None`` when nothing matched.
    """
    # The domain label accepts digits and hyphens as well as letters;
    # with letters only, "sales@abc-tech.com" was cut down to "sales@abc".
    reg = r'[a-z0-9_.-]+(?:\s*@\s*)[a-z0-9-]+(?:\s*\.?\s*[a-z]{2,3})\s*'
    emails = [email.strip() for email in re.findall(reg, text, re.IGNORECASE)]
    return emails or None
# Function to extract designations from text using regular expression
def extract_designation(text):
    """Return the job titles found in *text*.

    Matching is case-insensitive and anchored on word boundaries against a
    fixed list of titles. Matches are returned in order of appearance,
    exactly as they are written in *text*.

    Returns:
        List of matched titles, or ``None`` when there are none.
    """
    title_pattern = re.compile(
        r'\b(?:CEO|CFO|CTO|COO|CMO|CIO|President|Vice\s?President|Director|Manager|Executive\s?Director|Assistant\s?Manager|Account\s?Manager|Sales\s?Manager|Marketing\s?Manager|Product\s?Manager|Project\s?Manager|HR\s?Manager|Human\s?Resources\s?Manager|Operations\s?Manager|Business\s?Development\s?Manager|Senior\s?Manager|General\s?Manager|Team\s?Lead|Consultant|Analyst|Engineer|Architect|Designer|Developer|Programmer|Coordinator|Specialist|Supervisor|Administrator|Assistant|Associate|Partner|Founder|Owner|Principal|Expert|Technician|Officer|Representative|Agent|Accountant|Auditor|Trainer|Coach|Educator|Professor|Instructor|Researcher|Scientist|Doctor|Nurse|Therapist|Pharmacist|Attorney|Lawyer|Legal\s?Counsel|Paralegal|Advocate|Solicitor|Notary|Financial\s?Advisor|Investment\s?Advisor|Wealth\s?Manager|Broker|Realtor|Mortgage\s?Broker|Insurance\s?Agent)\b',
        re.IGNORECASE,
    )
    found = title_pattern.findall(text)
    if not found:
        return None
    return found
# Function to extract website URLs from text using regular expression
def extract_websites(text):
    """Find website-like strings in *text*.

    A match is an optional ``http://`` / ``https://`` scheme, an optional
    ``www.`` prefix, and two or more dot-separated word-character labels.

    Args:
        text: Text to search.

    Returns:
        List of the full matched strings, or ``None`` when nothing matched.
    """
    pattern = r'(https?://)?(www\.)?(\w+)(\.\w+)+'
    # Use the whole match rather than joining the captured groups: a repeated
    # group only keeps its last repetition, so joining groups turned
    # "www.example.co.uk" into "www.example.uk".
    websites = [match.group(0) for match in re.finditer(pattern, text)]
    return websites or None
# Function to extract PIN codes from text using regular expression
def extract_pin_code(text):
    """Return the first standalone six-digit number in *text*.

    "Standalone" means delimited by word boundaries, so six digits inside a
    longer digit run do not count.

    Returns:
        The six-digit string, or ``None`` when there is no such number.
    """
    hit = re.search(r'\b\d{6}\b', text.lower())
    return hit.group() if hit else None
import pandas as pd
# Streamlit UI
#
# Flow: upload a card image -> OCR it -> draw the detections -> ask the NER
# endpoint for people / organisations / locations -> let the user confirm or
# type each field -> show the collected fields as a one-row table.


def _select_or_enter(label, candidates):
    """Let the user pick one of *candidates* for a field, or type a value.

    A select box with a leading blank option is shown when *candidates* is
    non-empty. A text input is offered whenever no candidate is selected,
    i.e. when there are no candidates or the blank option is chosen.

    Args:
        label: Human-readable field name used in the widget labels.
        candidates: List of suggested values; ``None`` or empty means none.

    Returns:
        The selected or typed string; ``""`` when the user supplied neither.
    """
    if candidates:
        chosen = st.selectbox(f"Select {label}:", [""] + list(candidates))
        if chosen:
            return chosen
    return st.text_input(f"Enter {label} manually:")


st.title("Business Card Data Extractor using OpenCV and Streamlit")
uploaded_file = st.file_uploader(
    label="Please upload a business card",
    type=['jpeg', 'jpg', 'png', 'webp'],
    accept_multiple_files=False,
)

if uploaded_file is not None:
    res2, res, image = image_upload_and_ocr(reader, uploaded_file)
    if res2 is not None:
        drawing_image = drawing_detection(res2, image)

        # Start from "no entities" so a failed NER call still leaves the
        # regex-based fields and manual entry usable instead of crashing
        # on an undefined name further down.
        named_entities = {}
        try:
            output = query({
                "inputs": res,
            })
            entity_type, named_entities = get_ner_from_transformer(output)
        except Exception as e:
            st.error("An error occurred while processing the business card. Please try again later.")
            st.error(f"Error details: {str(e)}")

        # Locations are offered as a single comma-joined suggestion.
        locations = named_entities.get("LOC", [])
        location_options = [", ".join(locations)] if locations else []

        pin_code = extract_pin_code(res)
        pin_code_options = [pin_code] if pin_code is not None else []

        # (output column, widget label, suggested values), in display order.
        # Person and company names are taken from the NER "PER" / "ORG" groups.
        field_specs = [
            ("Name", "Person's Name", named_entities.get("PER", [])),
            ("Designation", "Designation", extract_designation(res)),
            ("Company Name", "Company Name", named_entities.get("ORG", [])),
            ("Email", "Email", extract_email(res)),
            ("Website", "Website", extract_websites(res)),
            ("Phone Number", "Phone Number", extract_phone_numbers(res)),
            ("Location", "Location", location_options),
            ("PIN Code", "PIN Code", pin_code_options),
        ]

        extracted_data = {}
        for column, label, candidates in field_specs:
            value = _select_or_enter(label, candidates)
            if value:
                extracted_data[column] = value

        # Display extracted data
        if extracted_data:
            st.write("Extracted Data:")
            df = pd.DataFrame(
                [extracted_data],
                columns=["Name", "Designation", "Company Name", "Email", "Website", "Phone Number", "Location", "PIN Code"],
            )
            st.write(df)