Spaces:
Sleeping
Sleeping
File size: 2,434 Bytes
250467d f07599d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import streamlit as st
from PIL import Image
import re
from transformers import AutoModel, AutoTokenizer
st.set_page_config(page_title="OCR Application", page_icon="🖼️", layout="wide")
device = "cpu"
@st.cache_resource
#def load_model():
#processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
#model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten', device_map='cpu')
#@st.cache_resource
def load_model():
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, device_map='cpu')
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cpu', use_safetensors=True)
processor=tokenizer
return processor, model
def extract_text(image, processor, model):
# Preprocess the image and extract text
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
extracted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
return extracted_text
def highlight_matches(text, keywords):
# Highlight keywords in the extracted text
pattern = re.compile(f"({re.escape(keywords)})", re.IGNORECASE)
highlighted_text = pattern.sub(r"<mark>\1</mark>", text)
return highlighted_text
def main():
st.title("OCR Text Extractor using Hugging Face Model")
# Load model and processor
processor, model = load_model()
# Upload Image
uploaded_file = st.file_uploader("Upload an image for OCR", type=["png", "jpg", "jpeg"])
if uploaded_file:
image = Image.open(uploaded_file)
st.image(image, caption="Uploaded Image", use_column_width=True)
# Extract text from the image
with st.spinner("Extracting text from the image..."):
extracted_text = extract_text(image, processor, model)
st.subheader("Extracted Text")
st.text_area("Text from Image", extracted_text, height=300)
# Keyword search
st.subheader("Keyword Search")
keywords = st.text_input("Enter keywords to search:")
if st.button("Search"):
highlighted_text = highlight_matches(extracted_text, keywords)
st.subheader("Search Results")
st.markdown(highlighted_text, unsafe_allow_html=True)
if __name__ == "__main__":
main()
|