kaiserpister committed on
Commit
59122b6
·
1 Parent(s): 1a9597f

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Demo Pdfchat
3
- emoji:
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.44.3
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: demo-pdfchat
3
+ app_file: ui.py
 
 
4
  sdk: gradio
5
+ sdk_version: 3.35.2
 
 
6
  ---
 
 
__pycache__/pdfparser.cpython-310.pyc ADDED
Binary file (3.44 kB). View file
 
__pycache__/ui.cpython-310.pyc ADDED
Binary file (1.88 kB). View file
 
pdfparser.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+
4
+ import boto3
5
+ from langchain.document_loaders import PyPDFium2Loader
6
+ from langchain.embeddings.openai import OpenAIEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.vectorstores import FAISS
9
+ from pdf2image import convert_from_path
10
+ from sllim import chat
11
+
12
# Standard Textract client setup
# NOTE(review): assumes AWS credentials/region come from the environment — confirm deployment config
textract_client = boto3.client("textract")
# Prompt template sent to the chat model; {docs} and {query} are filled per request.
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
DOCUMENTS:
{docs}
---
QUERY:
{query}
"""
# Embedding model shared by indexing and querying (presumably reads OPENAI_API_KEY — verify).
embeddings = OpenAIEmbeddings()
22
+
23
+
24
def convert_pdf_to_text(pdf_file_path: str):
    """OCR each page of the PDF with AWS Textract; return one text string per page."""
    # Render every PDF page to an in-memory image.
    pages = convert_from_path(pdf_file_path)

    page_texts = []
    for page_image in pages:
        # Serialize the rendered page to JPEG bytes for the Textract API.
        buffer = io.BytesIO()
        page_image.save(buffer, "JPEG")
        payload = buffer.getvalue()
        buffer.close()

        # Run text detection on this page.
        result = textract_client.detect_document_text(Document={"Bytes": payload})

        # Keep LINE blocks only; each line is newline-terminated, matching
        # the original page-text format.
        lines = [
            block["Text"] + "\n"
            for block in result["Blocks"]
            if block["BlockType"] == "LINE"
        ]
        page_texts.append("".join(lines))
    return page_texts
45
+
46
+
47
def process_file(file_path):
    """Build and persist a FAISS index for *file_path*; no-op if it already exists."""
    index_path = get_index_name(file_path)
    if os.path.exists(index_path):
        # Already indexed — nothing to do.
        return

    # Extract text from the PDF.
    documents = PyPDFium2Loader(file_path).load()

    # Split the extracted text into overlapping ~1000-char chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_documents(documents)
    if not chunks:
        # No extractable text layer (e.g. scanned pages) — fall back to OCR.
        chunks = splitter.create_documents(convert_pdf_to_text(file_path))

    # Embed the chunks and save the index to disk.
    FAISS.from_documents(chunks, embeddings).save_local(index_path)
69
+
70
+
71
def get_index_name(file_path):
    """Map a PDF path to its FAISS index directory name (file stem + '_faiss_index')."""
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return f"{stem}_faiss_index"
75
+
76
+
77
def ask_question_all(history):
    """Answer the newest question in *history* using every PDF uploaded so far.

    History entries are (user, bot) pairs: a non-string user entry is a file
    upload, a string entry with a bot reply is a past exchange, and a string
    entry without a reply is the new question to answer.
    """
    index_names = []
    retrieved = []
    messages = []

    for user_turn, bot_turn in history:
        if not isinstance(user_turn, str):
            # File upload: remember which on-disk index to search later.
            index_names.append(get_index_name(user_turn[0]))
            continue
        if bot_turn:
            # Completed exchange: replay it as conversation context.
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": bot_turn})
            continue
        # New question: gather similar chunks from every uploaded document.
        for name in index_names:
            store = FAISS.load_local(name, embeddings)
            retrieved.extend(store.similarity_search(user_turn))
        context = "\n".join(doc.page_content for doc in retrieved)
        messages.append(
            {
                "role": "user",
                "content": template.format(query=user_turn, docs=context),
            }
        )

    # send similar paragraphs with question to model
    return chat(messages, model="gpt-3.5-turbo")
104
+
105
+
106
def ask_question(query, upload_file, history=None):
    """Answer *query* against a single uploaded PDF.

    Builds (or reuses) the FAISS index for the uploaded file, retrieves the
    chunks most similar to the query, and asks the chat model.

    Args:
        query: the user's question.
        upload_file: object exposing ``.name`` with the PDF's path (gradio upload).
        history: unused; kept for interface compatibility.

    Returns:
        The chat model's answer string.
    """
    file_path = upload_file.name

    # Build the index on first use (no-op if it already exists on disk).
    # Delegates to process_file instead of duplicating its split/embed/save
    # logic, which this function previously copied verbatim.
    process_file(file_path)
    db = FAISS.load_local(get_index_name(file_path), embeddings)

    # Retrieve the chunks most similar to the query.
    docs = db.similarity_search(query)
    messages = [
        {
            "role": "user",
            "content": template.format(
                query=query, docs="\n".join(doc.page_content for doc in docs)
            ),
        }
    ]

    # send similar paragraphs with question to model
    return chat(messages, model="gpt-3.5-turbo")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ sllim
2
+ openai
3
+ faiss-cpu
4
+ tiktoken
5
+ pdf2image
6
+ pypdfium2
7
+ gradio
8
+ boto3
9
+ langchain
ui.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+
5
+ from pdfparser import ask_question_all, process_file
6
+
7
# Shared access password gating the demo; raises KeyError at import time if unset.
PASSWORD = os.environ["OPEN_PASSWORD"]
8
+
9
+
10
def add_text(history, text):
    """Append the user's message as a pending (text, None) turn and lock the textbox."""
    updated = history + [(text, None)]
    # Clear the input and disable it until the bot reply lands.
    return updated, gr.update(value="", interactive=False)
13
+
14
+
15
def add_file(history, file):
    """Append an uploaded file to the chat history as a ((path,), None) turn."""
    # A tuple user entry marks the turn as a file upload rather than text.
    return history + [((file.name,), None)]
18
+
19
+
20
def bot(history):
    """Fill in the bot reply for the newest turn, gating everything on the password."""
    if history[0][0] != PASSWORD:
        response = "Wrong password"
    elif len(history) == 1:
        # Only the password itself has been entered so far.
        response = "Access granted."
    else:
        # Answer using every PDF uploaded after the password turn.
        response = ask_question_all(history[1:])
    # NOTE(review): assumes gradio delivers history turns as mutable lists — confirm
    history[-1][1] = response
    return history
30
+
31
+
32
def bot_upload(history):
    """Index the just-uploaded file (last turn) if the first turn holds the password."""
    if history[0][0] != PASSWORD:
        history[-1][1] = "Wrong password"
        return history
    # The last turn's user entry is ((file_path,), None); build its FAISS index now.
    process_file(history[-1][0][0])
    history[-1][1] = "Ready."
    return history
39
+
40
+
41
# Assemble the Gradio UI: a chatbot pane above a text input and an upload button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=450)

    with gr.Row():
        with gr.Column(scale=0.85):
            # Free-text question input.
            txt = gr.Textbox(
                show_label=False,
                placeholder="First upload a pdf file, then query it",
            ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            # PDF upload button.
            btn = gr.UploadButton("📁", file_types=["pdf"])

    # Submitting text: append the message to the chat, then let the bot answer.
    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        bot, chatbot, chatbot
    )
    # Re-enable the textbox once the bot reply has been produced.
    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
    # Uploading a file: append it to the chat, then index it.
    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(
        bot_upload, chatbot, chatbot
    )

demo.launch()