Spaces:

Temuzin64
/

Telugu_TextExtraction

Sleeping

App Files Files Community

Temuzin64 committed on May 26

Commit

2d493f2

verified ·

1 Parent(s): 877aa9d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +61 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,63 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from PIL import Image, ImageFilter, ImageEnhance
+import tempfile
+import os
+import easyocr
+from transformers import MT5ForConditionalGeneration, MT5Tokenizer, pipeline
+# Load the google/mt5-small tokenizer and model at module level and wrap them in a text2text-generation pipeline (legacy=False / use_fast=False are set to avoid tokenizer warnings)
+tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small", legacy=False, use_fast=False)
+model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
+pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)  # NOTE(review): module-level load; if Streamlit re-runs this script per interaction it is repeated unless cached (e.g. st.cache_resource) — confirm
+# Preprocess uploaded image to improve OCR accuracy: grayscale -> 2x upscale -> contrast boost -> sharpen; takes a PIL image and returns the processed PIL image
+def preprocess_image_pillow(image):
+    img = image.convert("L")  # Grayscale (Pillow mode "L")
+    width, height = img.size
+    img = img.resize((width * 2, height * 2), Image.LANCZOS)  # Double both dimensions using Lanczos resampling
+    enhancer = ImageEnhance.Contrast(img)
+    img = enhancer.enhance(2.0)  # Factor above 1.0 increases contrast
+    img = img.filter(ImageFilter.SHARPEN)  # Apply Pillow's built-in SHARPEN filter
+    return img
+# Streamlit App UI: upload an image, preprocess it, run Telugu OCR on it, then pass the OCR text through the mT5 pipeline
+st.set_page_config(page_title="📝 Telugu OCR & Correction", layout="centered")
+st.title("📝 Telugu Handwriting to Typed Text")
+uploaded_file = st.file_uploader("📤 Upload Telugu handwritten image", type=["png", "jpg", "jpeg"])
+if uploaded_file:  # Everything below runs only once a file has been uploaded
+    image = Image.open(uploaded_file).convert("RGB")
+    enhanced_image = preprocess_image_pillow(image)
+    st.image(enhanced_image, caption="Preprocessed Image", use_container_width=True)
+    # Save temporarily for EasyOCR; delete=False keeps the file on disk after the with-block so it can be read back by path
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp:
+        enhanced_image.save(temp.name)  # NOTE(review): saves by path while the temp handle is still open — may fail on Windows; confirm target platform
+    try:  # NOTE(review): begins after the save above, so a failed save would skip the cleanup in finally
+        reader = easyocr.Reader(['te'], gpu=False)  # Telugu reader on CPU; NOTE(review): rebuilt on every upload — consider caching
+        results = reader.readtext(temp.name)
+        raw_text = "\n".join([text for (_, text, _) in results])  # Keep only the middle (text) element of each 3-tuple, one detection per line
+        st.markdown("### 📄 OCR Extracted Text")
+        st.text_area("📝 Telugu OCR", raw_text, height=150)
+        # Generate correction using mT5, only when OCR produced non-whitespace text
+        if raw_text.strip():
+            st.markdown("### ✅ LLM Corrected Telugu Text")
+            prompt = f"Correct the following Telugu text spelling and grammar:\n{raw_text}"
+            try:
+                response = pipe(prompt, max_new_tokens=256, do_sample=False)[0]['generated_text']  # Sampling disabled, at most 256 new tokens; take the first result's text
+                st.text_area("🤖 Corrected Text", response, height=150)
+                st.download_button("⬇️ Download", response, file_name="corrected_telugu.txt")
+            except Exception as e:  # Report any failure from the pipeline or widgets in the UI instead of raising
+                st.error(f"LLM Correction Error: {e}")
+        else:
+            st.warning("OCR did not extract any usable Telugu text.")
+    finally:
+        # Always remove the temp file (it was created with delete=False, so it is not removed automatically)
+        if os.path.exists(temp.name):
+            os.remove(temp.name)