khoatran94 committed on
Commit
5823725
·
1 Parent(s): 84b864d
Files changed (4) hide show
  1. app.py +88 -0
  2. packages.txt +2 -0
  3. prepare.py +29 -0
  4. requirements.txt +19 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import pytesseract
3
+ import os
4
+ import pymupdf
5
+
6
+ import streamlit as st
7
+ import gradio as gr
8
+ from prepare import prepare
9
+
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
11
+ from langchain.llms import HuggingFacePipeline
12
+ from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
13
+ from langchain_core.output_parsers import StrOutputParser
14
+ from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
15
+ from langchain_community.vectorstores.utils import filter_complex_metadata
16
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain_community.embeddings import HuggingFaceEmbeddings
18
+ from langchain_community.vectorstores import FAISS
19
+ from langchain.schema.runnable import RunnablePassthrough
20
+ from langchain_core.messages import AIMessage, HumanMessage
21
+ from langchain_community.llms import HuggingFaceEndpoint
22
+ from dotenv import load_dotenv
23
+
24
+
25
+
26
+
27
def read_pdf(file_path):
    """Return the text of a PDF, using OCR for pages that yield no text.

    Each page is first asked for its embedded text. When a page returns an
    empty string, every image referenced by that page is rendered to PNG in
    memory and passed to Tesseract with the Vietnamese (``vie``) language
    model; each OCR result is followed by a newline.

    Args:
        file_path: Path of the PDF to open with ``pymupdf.open``.

    Returns:
        A single string containing the per-page text and OCR results in
        page order.
    """
    import io  # local import: only needed for the in-memory PNG hand-off

    pieces = []
    # The context manager closes the document even if extraction fails.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text = page.get_text()
            if text:
                pieces.append(text)
                continue

            # No text on this page: OCR each embedded image instead.
            for image_info in page.get_images():
                xref = image_info[0]  # first item is the image's XREF
                pix = pymupdf.Pixmap(doc, xref)

                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                # Encode in memory rather than writing "page_N-image_M.png"
                # files into the working directory that are never removed.
                png_bytes = pix.tobytes("png")
                pix = None  # release the pixmap before running OCR

                with Image.open(io.BytesIO(png_bytes)) as image:
                    ocr_text = pytesseract.image_to_string(image, lang='vie')
                pieces.append(ocr_text + '\n')

    return ''.join(pieces)
49
+
50
+ # Function to query Hugging Face endpoint
51
def query_huggingface(text):
    """Send ``text`` to the ``google/gemma-2-9b-it`` endpoint and return the reply.

    ``load_dotenv()`` is called first, then the token is read from the
    ``API_TOKEN`` environment variable and passed to ``HuggingFaceEndpoint``
    together with the ``text-generation`` task.

    Args:
        text: Prompt forwarded unchanged to the endpoint's ``invoke``.

    Returns:
        Whatever ``HuggingFaceEndpoint.invoke`` returns for ``text``.
    """
    load_dotenv()
    endpoint = HuggingFaceEndpoint(
        huggingfacehub_api_token=os.getenv("API_TOKEN"),
        repo_id="google/gemma-2-9b-it",
        task="text-generation",
    )
    reply = endpoint.invoke(text)
    return reply
62
+
63
+ # Gradio Interface for PDF Processing
64
def process_file(file, query=None):
    """Extract the text of an uploaded PDF and optionally query Hugging Face.

    The Gradio interface in this file declares two inputs (file, query) and
    two outputs, so the callback has to accept both values and return a pair.

    Args:
        file: The upload. May be ``None`` (nothing uploaded), a path
            (``str`` / ``os.PathLike``), an object whose ``name`` attribute
            is the path of an existing file, or a readable binary stream.
        query: Text to send to ``query_huggingface``. When omitted
            (``None``) the function keeps its original one-argument
            behaviour and returns only the PDF text.

    Returns:
        ``pdf_text`` when ``query`` is ``None``; otherwise the tuple
        ``(pdf_text, hugging_face_output)``. A blank query produces an empty
        Hugging Face output without calling the endpoint.
    """
    import tempfile  # local import: only needed for stream uploads

    # Resolve the upload to a path on disk where one is available.
    path = None
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    elif file is not None:
        candidate = getattr(file, "name", None)
        if isinstance(candidate, str) and os.path.isfile(candidate):
            path = candidate

    if file is None:
        pdf_output = ""
    elif path is not None:
        pdf_output = read_pdf(path)
    else:
        # Pure in-memory stream: spool it to a unique temporary file so
        # concurrent requests do not overwrite each other, then clean up.
        data = file.read()
        scratch = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        try:
            with scratch:
                scratch.write(data)
            pdf_output = read_pdf(scratch.name)
        finally:
            os.remove(scratch.name)

    if query is None:
        return pdf_output

    # NOTE(review): the query is sent on its own; confirm whether the PDF
    # text should be included in the prompt as context.
    hf_output = query_huggingface(query) if query.strip() else ""
    return pdf_output, hf_output
70
+
71
+ # Create Gradio App
72
# Gradio UI: a file upload plus a free-text query in, two text boxes out.
# NOTE(review): two inputs and two outputs are declared here — confirm that
# process_file accepts and returns a matching number of values.
interface = gr.Interface(
    process_file,
    inputs=[
        gr.File(label="Upload a PDF file"),
        gr.Textbox(label="Enter your query for Hugging Face"),
    ],
    outputs=[
        gr.Textbox(label="PDF Content"),
        gr.Textbox(label="Hugging Face Output"),
    ],
    title="PDF Processor with Hugging Face Query",
)

# When run as a script: call prepare() first, then start the Gradio app.
if __name__ == "__main__":
    prepare()
    interface.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr -y
2
+ libtesseract-dev -y
prepare.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import requests
4
+
5
def prepare(
    url="https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/vie.traineddata",
    destination_folder='/usr/share/tesseract-ocr/5/tessdata',
    timeout=30,
):
    """Download a Tesseract traineddata file and copy it into the tessdata folder.

    The file is streamed to ``<name>.part`` in the working directory and
    renamed into place only after the download completes, so an interrupted
    transfer never leaves a truncated model to be installed. A download
    error is printed rather than raised; the copy step then uses a copy left
    in the working directory by an earlier run, if there is one.

    Args:
        url: Location of the traineddata file. The local file name is the
            last path segment of this URL.
        destination_folder: Directory the downloaded file is copied into.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang start-up indefinitely.

    Raises:
        FileNotFoundError: If the download failed and no local copy exists,
            or ``destination_folder`` does not exist.
        OSError: If the file cannot be written or copied.
    """
    # Local file path, named after the remote file.
    destination_path = url.rsplit("/", 1)[-1]
    partial_path = destination_path + ".part"

    try:
        print(f"Downloading from {url}...")
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

            # Write the content to a scratch file in chunks.
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

        # Publish the finished download under its final name.
        os.replace(partial_path, destination_path)
        print(f"File downloaded successfully and saved as {destination_path}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        # Drop the incomplete scratch file so it cannot be mistaken for data.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    destination_file = os.path.join(destination_folder, os.path.basename(destination_path))
    shutil.copy(destination_path, destination_file)
    print(f"File copied successfully to {destination_file}")
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ pytesseract
3
+ pymupdf
4
+ python-dotenv
5
+ langchain
6
+ langchain_huggingface
7
+ langchain_experimental
8
+ langchain-google-genai
9
+ langchain-core
10
+ langchain-community
11
+ huggingface-hub
12
+ transformers
13
+ bitsandbytes
14
+ torch
15
+ pillow
16
+ sentence-transformers
17
+ faiss-cpu
18
+ bs4
19
+ accelerate