Spaces:

zeonai
/

parser_benchmarking

Runtime error

chandanzeon committed on Feb 4

Commit

2df4c2a

verified ·

1 Parent(s): 028c3e1

Added Llama Module (#2)

- Added Llama Module (b82470a4e8b4a638eeec7f40ca6d83258b4f21a6)

Co-authored-by: Chandan Kumar Nayak <[email protected]>

Files changed (2) hide show

app.py CHANGED Viewed

@@ -19,12 +19,17 @@ if uploaded_file:
     try:
         with st.spinner("Processing document..."):
-            docspaddle, docsdocling = process_docs(doc_path)
             if os.path.exists("./Tested_Docs"):
                 shutil.rmtree('./Tested_Docs')
     except Exception as e:
         st.warning(e)
     st.markdown("### Extracted Text by Docling-OCR :")
     for page_number, txt in docsdocling.items():
         st.markdown(f"#### Page {page_number}")
@@ -33,4 +38,4 @@ if uploaded_file:
     st.markdown("### Extracted Text by Paddle-OCR :")
     for page_number, txt in enumerate(docspaddle):
         st.markdown(f"#### Page {page_number+1}")
-        st.text(txt)

     try:
         with st.spinner("Processing document..."):
+            docsllama, docspaddle, docsdocling = process_docs(doc_path)
             if os.path.exists("./Tested_Docs"):
                 shutil.rmtree('./Tested_Docs')
     except Exception as e:
         st.warning(e)
+    st.markdown("### Extracted Text by Llama-Parser :")
+    for page_number, txt in enumerate(docsllama):
+        st.markdown(f"#### Page {page_number+1}")
+        st.text(txt)
     st.markdown("### Extracted Text by Docling-OCR :")
     for page_number, txt in docsdocling.items():
         st.markdown(f"#### Page {page_number}")
     st.markdown("### Extracted Text by Paddle-OCR :")
     for page_number, txt in enumerate(docspaddle):
         st.markdown(f"#### Page {page_number+1}")
+        st.text(txt)

helper.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from docling.document_converter import DocumentConverter
 from paddleocr import PaddleOCR
 from pdf2image import convert_from_path
 import numpy as np
 def process_text(res):
     page_texts = {}
@@ -73,6 +76,17 @@ def process_docs(doc_path):
         query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
     """
     ## Paddle OCR
     ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
     images_pdf = convert_from_path(doc_path, 300)
@@ -90,4 +104,4 @@ def process_docs(doc_path):
     docs3 = process_text(res)
     docs3 = process_tables(res,docs3)
-    return docs2,docs3

 from docling.document_converter import DocumentConverter
 from paddleocr import PaddleOCR
+from llama_parse import LlamaParse
 from pdf2image import convert_from_path
 import numpy as np
+import os
+llama_key = os.getenv('LLAMA_INDEX_API_KEY')
 def process_text(res):
     page_texts = {}
         query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
     """
+    ## LLama Parser
+    parser = LlamaParse(
+        api_key=llama_key,
+        result_type='markdown',
+        verbose=True,
+        language='en',
+        num_workers=2
+    )
+    documents = parser.load_data(doc_path)
+    docs = [doc.text for doc in documents]
     ## Paddle OCR
     ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
     images_pdf = convert_from_path(doc_path, 300)
     docs3 = process_text(res)
     docs3 = process_tables(res,docs3)
+    return docs, docs2, docs3