Spaces:
Runtime error
Runtime error
Added LLama Module (#2)
Browse files
- Added LLama Module (b82470a4e8b4a638eeec7f40ca6d83258b4f21a6)
Co-authored-by: Chandan Kumar Nayak <[email protected]>
app.py
CHANGED
@@ -19,12 +19,17 @@ if uploaded_file:
|
|
19 |
|
20 |
try:
|
21 |
with st.spinner("Processing document..."):
|
22 |
-
docspaddle, docsdocling = process_docs(doc_path)
|
23 |
if os.path.exists("./Tested_Docs"):
|
24 |
shutil.rmtree('./Tested_Docs')
|
25 |
except Exception as e:
|
26 |
st.warning(e)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
st.markdown("### Extracted Text by Docling-OCR :")
|
29 |
for page_number, txt in docsdocling.items():
|
30 |
st.markdown(f"#### Page {page_number}")
|
@@ -33,4 +38,4 @@ if uploaded_file:
|
|
33 |
st.markdown("### Extracted Text by Paddle-OCR :")
|
34 |
for page_number, txt in enumerate(docspaddle):
|
35 |
st.markdown(f"#### Page {page_number+1}")
|
36 |
-
st.text(txt)
|
|
|
19 |
|
20 |
try:
|
21 |
with st.spinner("Processing document..."):
|
22 |
+
docsllama, docspaddle, docsdocling = process_docs(doc_path)
|
23 |
if os.path.exists("./Tested_Docs"):
|
24 |
shutil.rmtree('./Tested_Docs')
|
25 |
except Exception as e:
|
26 |
st.warning(e)
|
27 |
|
28 |
+
st.markdown("### Extracted Text by Llama-Parser :")
|
29 |
+
for page_number, txt in enumerate(docsllama):
|
30 |
+
st.markdown(f"#### Page {page_number+1}")
|
31 |
+
st.text(txt)
|
32 |
+
|
33 |
st.markdown("### Extracted Text by Docling-OCR :")
|
34 |
for page_number, txt in docsdocling.items():
|
35 |
st.markdown(f"#### Page {page_number}")
|
|
|
38 |
st.markdown("### Extracted Text by Paddle-OCR :")
|
39 |
for page_number, txt in enumerate(docspaddle):
|
40 |
st.markdown(f"#### Page {page_number+1}")
|
41 |
+
st.text(txt)
|
helper.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1 |
from docling.document_converter import DocumentConverter
|
2 |
from paddleocr import PaddleOCR
|
|
|
3 |
from pdf2image import convert_from_path
|
4 |
import numpy as np
|
|
|
|
|
5 |
|
6 |
def process_text(res):
|
7 |
page_texts = {}
|
@@ -73,6 +76,17 @@ def process_docs(doc_path):
|
|
73 |
query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
|
74 |
"""
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
## Paddle OCR
|
77 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
|
78 |
images_pdf = convert_from_path(doc_path, 300)
|
@@ -90,4 +104,4 @@ def process_docs(doc_path):
|
|
90 |
docs3 = process_text(res)
|
91 |
docs3 = process_tables(res,docs3)
|
92 |
|
93 |
-
return docs2,docs3
|
|
|
1 |
from docling.document_converter import DocumentConverter
|
2 |
from paddleocr import PaddleOCR
|
3 |
+
from llama_parse import LlamaParse
|
4 |
from pdf2image import convert_from_path
|
5 |
import numpy as np
|
6 |
+
import os
|
7 |
+
llama_key = os.getenv('LLAMA_INDEX_API_KEY')
|
8 |
|
9 |
def process_text(res):
|
10 |
page_texts = {}
|
|
|
76 |
query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
|
77 |
"""
|
78 |
|
79 |
+
## LLama Parser
|
80 |
+
parser = LlamaParse(
|
81 |
+
api_key=llama_key,
|
82 |
+
result_type='markdown',
|
83 |
+
verbose=True,
|
84 |
+
language='en',
|
85 |
+
num_workers=2
|
86 |
+
)
|
87 |
+
documents = parser.load_data(doc_path)
|
88 |
+
docs = [doc.text for doc in documents]
|
89 |
+
|
90 |
## Paddle OCR
|
91 |
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
|
92 |
images_pdf = convert_from_path(doc_path, 300)
|
|
|
104 |
docs3 = process_text(res)
|
105 |
docs3 = process_tables(res,docs3)
|
106 |
|
107 |
+
return docs, docs2, docs3
|