Gopal2002 chandanzeon committed on
Commit
2df4c2a
·
verified ·
1 Parent(s): 028c3e1

Added LLama Module (#2)

Browse files

- Added LLama Module (b82470a4e8b4a638eeec7f40ca6d83258b4f21a6)


Co-authored-by: Chandan Kumar Nayak <[email protected]>

Files changed (2) hide show
  1. app.py +7 -2
  2. helper.py +15 -1
app.py CHANGED
@@ -19,12 +19,17 @@ if uploaded_file:
19
 
20
  try:
21
  with st.spinner("Processing document..."):
22
- docspaddle, docsdocling = process_docs(doc_path)
23
  if os.path.exists("./Tested_Docs"):
24
  shutil.rmtree('./Tested_Docs')
25
  except Exception as e:
26
  st.warning(e)
27
 
 
 
 
 
 
28
  st.markdown("### Extracted Text by Docling-OCR :")
29
  for page_number, txt in docsdocling.items():
30
  st.markdown(f"#### Page {page_number}")
@@ -33,4 +38,4 @@ if uploaded_file:
33
  st.markdown("### Extracted Text by Paddle-OCR :")
34
  for page_number, txt in enumerate(docspaddle):
35
  st.markdown(f"#### Page {page_number+1}")
36
- st.text(txt)
 
19
 
20
  try:
21
  with st.spinner("Processing document..."):
22
+ docsllama, docspaddle, docsdocling = process_docs(doc_path)
23
  if os.path.exists("./Tested_Docs"):
24
  shutil.rmtree('./Tested_Docs')
25
  except Exception as e:
26
  st.warning(e)
27
 
28
+ st.markdown("### Extracted Text by Llama-Parser :")
29
+ for page_number, txt in enumerate(docsllama):
30
+ st.markdown(f"#### Page {page_number+1}")
31
+ st.text(txt)
32
+
33
  st.markdown("### Extracted Text by Docling-OCR :")
34
  for page_number, txt in docsdocling.items():
35
  st.markdown(f"#### Page {page_number}")
 
38
  st.markdown("### Extracted Text by Paddle-OCR :")
39
  for page_number, txt in enumerate(docspaddle):
40
  st.markdown(f"#### Page {page_number+1}")
41
+ st.text(txt)
helper.py CHANGED
@@ -1,7 +1,10 @@
1
  from docling.document_converter import DocumentConverter
2
  from paddleocr import PaddleOCR
 
3
  from pdf2image import convert_from_path
4
  import numpy as np
 
 
5
 
6
  def process_text(res):
7
  page_texts = {}
@@ -73,6 +76,17 @@ def process_docs(doc_path):
73
  query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
74
  """
75
 
 
 
 
 
 
 
 
 
 
 
 
76
  ## Paddle OCR
77
  ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
78
  images_pdf = convert_from_path(doc_path, 300)
@@ -90,4 +104,4 @@ def process_docs(doc_path):
90
  docs3 = process_text(res)
91
  docs3 = process_tables(res,docs3)
92
 
93
- return docs2,docs3
 
1
  from docling.document_converter import DocumentConverter
2
  from paddleocr import PaddleOCR
3
+ from llama_parse import LlamaParse
4
  from pdf2image import convert_from_path
5
  import numpy as np
6
+ import os
7
+ llama_key = os.getenv('LLAMA_INDEX_API_KEY')
8
 
9
  def process_text(res):
10
  page_texts = {}
 
76
  query_engine_llama, query_engine_paddle, images: Query engines for LlamaParse and PaddleOCR, and a list of extracted images.
77
  """
78
 
79
+ ## LLama Parser
80
+ parser = LlamaParse(
81
+ api_key=llama_key,
82
+ result_type='markdown',
83
+ verbose=True,
84
+ language='en',
85
+ num_workers=2
86
+ )
87
+ documents = parser.load_data(doc_path)
88
+ docs = [doc.text for doc in documents]
89
+
90
  ## Paddle OCR
91
  ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
92
  images_pdf = convert_from_path(doc_path, 300)
 
104
  docs3 = process_text(res)
105
  docs3 = process_tables(res,docs3)
106
 
107
+ return docs, docs2, docs3