Commit 5823725
Parent(s): 84b864d
1st init

Files changed:
- app.py +88 -0
- packages.txt +2 -0
- prepare.py +29 -0
- requirements.txt +19 -0
app.py
ADDED
@@ -0,0 +1,88 @@
from PIL import Image
import pytesseract
import os
import pymupdf

import streamlit as st
import gradio as gr
from prepare import prepare

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv


def read_pdf(file_path):
    """Extract text from a PDF; OCR any page that has no text layer (Vietnamese)."""
    output = ''
    doc = pymupdf.open(file_path)
    for page in range(len(doc)):
        text = doc[page].get_text().encode("utf8")
        if text:
            output += text.decode('utf-8')
        else:
            image_list = doc[page].get_images()
            for image_index, img in enumerate(image_list, start=1):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap

                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                path = "page_{}-image_{}.png".format(page, image_index)
                pix.save(path)  # save the image as PNG
                img = Image.open(path)
                pix = None
                output += pytesseract.image_to_string(img, lang='vie') + '\n'
    return output

# Function to query the Hugging Face endpoint
def query_huggingface(text):
    load_dotenv()
    api_token = os.getenv("API_TOKEN")
    repo_id = "google/gemma-2-9b-it"
    task = "text-generation"
    chat_model = HuggingFaceEndpoint(
        huggingfacehub_api_token=api_token,
        repo_id=repo_id,
        task=task
    )
    return chat_model.invoke(text)

# Gradio callback: read the uploaded PDF and run the query against the endpoint
def process_file(file, query):
    # gr.File passes a filepath string in Gradio 4, or a tempfile wrapper
    # with a .name attribute in Gradio 3; handle both
    file_path = file if isinstance(file, str) else file.name
    pdf_output = read_pdf(file_path)
    hf_output = query_huggingface(query) if query else ""
    return pdf_output, hf_output

# Create Gradio App
interface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload a PDF file"),
        gr.Textbox(label="Enter your query for Hugging Face"),
    ],
    outputs=[
        gr.Textbox(label="PDF Content"),
        gr.Textbox(label="Hugging Face Output"),
    ],
    title="PDF Processor with Hugging Face Query"
)

# Launch the Gradio App
if __name__ == "__main__":
    prepare()
    interface.launch()
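Most of the LangChain imports in app.py (text splitter, embeddings, FAISS, prompts) are never used. A minimal sketch, assuming they were intended for retrieval over the OCR output and reusing query_huggingface from above; the embedding model name and chunk sizes are assumptions, not part of this commit:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def build_index(text):
    # Split the extracted PDF text into overlapping chunks and index them
    # (chunk sizes below are illustrative assumptions)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # assumed model
    return FAISS.from_texts(chunks, embeddings)

def answer_from_pdf(index, question):
    # Retrieve the most relevant chunks and pass them to the endpoint
    docs = index.similarity_search(question, k=4)
    context = "\n\n".join(d.page_content for d in docs)
    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}"
    return query_huggingface(prompt)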
packages.txt
ADDED
@@ -0,0 +1,2 @@
tesseract-ocr
libtesseract-dev
prepare.py
ADDED
@@ -0,0 +1,29 @@
import os
import shutil
import requests

def prepare():
    url = "https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/vie.traineddata"

    # Destination file path
    destination_path = "vie.traineddata"

    try:
        print(f"Downloading from {url}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Write the content to a file
        with open(destination_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # Download in chunks
                file.write(chunk)

        print(f"File downloaded successfully and saved as {destination_path}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return  # nothing to copy if the download failed

    # Copy the language data into tesseract's tessdata directory
    destination_folder = '/usr/share/tesseract-ocr/5/tessdata'
    destination_file = os.path.join(destination_folder, os.path.basename(destination_path))
    shutil.copy(destination_path, destination_file)
    print(f"File copied successfully to {destination_file}")
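The copy into /usr/share/tesseract-ocr/5/tessdata assumes that exact tesseract version and that the process can write there. A minimal alternative sketch, assuming the downloaded file stays in the working directory: tesseract honors the standard TESSDATA_PREFIX environment variable, so the copy can be skipped.

import os

# Point tesseract at the directory containing vie.traineddata instead of
# copying it into the system tessdata folder (assumes the download above
# left the file in the current working directory)
os.environ["TESSDATA_PREFIX"] = os.path.abspath(".")
# pytesseract.image_to_string(img, lang='vie') will now resolve ./vie.traineddata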
requirements.txt
ADDED
@@ -0,0 +1,19 @@
requests
pytesseract
pymupdf
python-dotenv
langchain
langchain_huggingface
langchain_experimental
langchain-google-genai
langchain-core
langchain-community
huggingface-hub
transformers
bitsandbytes
torch
pillow
sentence-transformers
faiss-cpu
bs4
accelerate