kaiserpister committed on
Commit
737df3f
·
1 Parent(s): 59122b6

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. pdfparser.py +0 -33
pdfparser.py CHANGED
@@ -1,16 +1,12 @@
1
- import io
2
  import os
3
 
4
- import boto3
5
  from langchain.document_loaders import PyPDFium2Loader
6
  from langchain.embeddings.openai import OpenAIEmbeddings
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.vectorstores import FAISS
9
- from pdf2image import convert_from_path
10
  from sllim import chat
11
 
12
  # Standard Textract client setup
13
- textract_client = boto3.client("textract")
14
  template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
15
  DOCUMENTS:
16
  {docs}
@@ -21,29 +17,6 @@ QUERY:
21
  embeddings = OpenAIEmbeddings()
22
 
23
 
24
- def convert_pdf_to_text(pdf_file_path: str):
25
- # Convert the PDF to an in-memory image format
26
- images = convert_from_path(pdf_file_path)
27
-
28
- docs = []
29
- for image in images:
30
- # Convert the image into byte stream
31
- with io.BytesIO() as image_stream:
32
- image.save(image_stream, "JPEG")
33
- image_bytes = image_stream.getvalue()
34
-
35
- # Use Textract to detect text in the local image
36
- response = textract_client.detect_document_text(Document={"Bytes": image_bytes})
37
-
38
- text = ""
39
- # Print the detected text blocks
40
- for item in response["Blocks"]:
41
- if item["BlockType"] == "LINE":
42
- text += item["Text"] + "\n"
43
- docs.append(text)
44
- return docs
45
-
46
-
47
  def process_file(file_path):
48
  index_path = get_index_name(file_path)
49
  if os.path.exists(index_path):
@@ -59,9 +32,6 @@ def process_file(file_path):
59
  length_function=len,
60
  )
61
  docs = text_splitter.split_documents(data)
62
- if len(docs) == 0:
63
- data = convert_pdf_to_text(file_path)
64
- docs = text_splitter.create_documents(data)
65
 
66
  # Embed paragraphs
67
  db = FAISS.from_documents(docs, embeddings)
@@ -118,9 +88,6 @@ def ask_question(query, upload_file, history=None):
118
  length_function=len,
119
  )
120
  docs = text_splitter.split_documents(data)
121
- if len(docs) == 0:
122
- data = convert_pdf_to_text(file_path)
123
- docs = text_splitter.create_documents(data)
124
 
125
  # Embed paragraphs
126
  db = FAISS.from_documents(docs, embeddings)
 
 
1
  import os
2
 
 
3
  from langchain.document_loaders import PyPDFium2Loader
4
  from langchain.embeddings.openai import OpenAIEmbeddings
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.vectorstores import FAISS
 
7
  from sllim import chat
8
 
9
  # Standard Textract client setup
 
10
  template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
11
  DOCUMENTS:
12
  {docs}
 
17
  embeddings = OpenAIEmbeddings()
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def process_file(file_path):
21
  index_path = get_index_name(file_path)
22
  if os.path.exists(index_path):
 
32
  length_function=len,
33
  )
34
  docs = text_splitter.split_documents(data)
 
 
 
35
 
36
  # Embed paragraphs
37
  db = FAISS.from_documents(docs, embeddings)
 
88
  length_function=len,
89
  )
90
  docs = text_splitter.split_documents(data)
 
 
 
91
 
92
  # Embed paragraphs
93
  db = FAISS.from_documents(docs, embeddings)