Commit 5823725
Parent(s): 84b864d
1st init

Files changed:
- app.py +88 -0
- packages.txt +2 -0
- prepare.py +29 -0
- requirements.txt +19 -0
app.py
ADDED
@@ -0,0 +1,88 @@
from PIL import Image
import pytesseract
import os
import pymupdf

import streamlit as st
import gradio as gr
from prepare import prepare

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv


def read_pdf(file_path):
    """Extract text from a PDF; OCR any page that has no text layer (Vietnamese)."""
    output = ''
    doc = pymupdf.open(file_path)
    for page in range(len(doc)):
        text = doc[page].get_text().encode("utf8")
        if text:
            output += text.decode('utf-8')
        else:
            image_list = doc[page].get_images()
            for image_index, img in enumerate(image_list, start=1):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap

                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                path = "page_{}-image_{}.png".format(page, image_index)
                pix.save(path)  # save the image as PNG
                img = Image.open(path)
                pix = None
                output += pytesseract.image_to_string(img, lang='vie') + '\n'
    return output

# Function to query the Hugging Face endpoint
def query_huggingface(text):
    load_dotenv()
    api_token = os.getenv("API_TOKEN")
    repo_id = "google/gemma-2-9b-it"
    task = "text-generation"
    chat_model = HuggingFaceEndpoint(
        huggingfacehub_api_token=api_token,
        repo_id=repo_id,
        task=task
    )
    return chat_model.invoke(text)

# Gradio callback: read the uploaded PDF and run the query against the endpoint
def process_file(file, query):
    # gr.File passes a filepath string in Gradio 4, or a tempfile wrapper
    # with a .name attribute in Gradio 3; handle both
    file_path = file if isinstance(file, str) else file.name
    pdf_output = read_pdf(file_path)
    hf_output = query_huggingface(query) if query else ""
    return pdf_output, hf_output

# Create Gradio App
interface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload a PDF file"),
        gr.Textbox(label="Enter your query for Hugging Face"),
    ],
    outputs=[
        gr.Textbox(label="PDF Content"),
        gr.Textbox(label="Hugging Face Output"),
    ],
    title="PDF Processor with Hugging Face Query"
)

# Launch the Gradio App
if __name__ == "__main__":
    prepare()
    interface.launch()
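Most of the LangChain imports in app.py (text splitter, embeddings, FAISS, prompts) are never used. A minimal sketch, assuming they were intended for retrieval over the OCR output and reusing query_huggingface from above; the embedding model name and chunk sizes are assumptions, not part of this commit:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def build_index(text):
    # Split the extracted PDF text into overlapping chunks and index them
    # (chunk sizes below are illustrative assumptions)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # assumed model
    return FAISS.from_texts(chunks, embeddings)

def answer_from_pdf(index, question):
    # Retrieve the most relevant chunks and pass them to the endpoint
    docs = index.similarity_search(question, k=4)
    context = "\n\n".join(d.page_content for d in docs)
    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}"
    return query_huggingface(prompt)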
packages.txt
ADDED
@@ -0,0 +1,2 @@
tesseract-ocr
libtesseract-dev
prepare.py
ADDED
@@ -0,0 +1,29 @@
import os
import shutil
import requests

def prepare():
    url = "https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/vie.traineddata"

    # Destination file path
    destination_path = "vie.traineddata"

    try:
        print(f"Downloading from {url}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Write the content to a file
        with open(destination_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # Download in chunks
                file.write(chunk)

        print(f"File downloaded successfully and saved as {destination_path}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return  # nothing to copy if the download failed

    # Copy the language data into tesseract's tessdata directory
    destination_folder = '/usr/share/tesseract-ocr/5/tessdata'
    destination_file = os.path.join(destination_folder, os.path.basename(destination_path))
    shutil.copy(destination_path, destination_file)
    print(f"File copied successfully to {destination_file}")
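The copy into /usr/share/tesseract-ocr/5/tessdata assumes that exact tesseract version and that the process can write there. A minimal alternative sketch, assuming the downloaded file stays in the working directory: tesseract honors the standard TESSDATA_PREFIX environment variable, so the copy can be skipped.

import os

# Point tesseract at the directory containing vie.traineddata instead of
# copying it into the system tessdata folder (assumes the download above
# left the file in the current working directory)
os.environ["TESSDATA_PREFIX"] = os.path.abspath(".")
# pytesseract.image_to_string(img, lang='vie') will now resolve ./vie.traineddata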
requirements.txt
ADDED
@@ -0,0 +1,19 @@
requests
pytesseract
pymupdf
python-dotenv
langchain
langchain_huggingface
langchain_experimental
langchain-google-genai
langchain-core
langchain-community
huggingface-hub
transformers
bitsandbytes
torch
pillow
sentence-transformers
faiss-cpu
bs4
accelerate