RubenAMtz committed on
Commit
70c2a60
·
1 Parent(s): 31cbd5c

pdfloader, text cleaning, vector store, context in prompt

Browse files
Files changed (3) hide show
  1. aimakerspace/text_utils.py +60 -1
  2. app.py +49 -3
  3. requirements.txt +2 -1
aimakerspace/text_utils.py CHANGED
@@ -1,5 +1,9 @@
1
  import os
2
- from typing import List
 
 
 
 
3
 
4
 
5
  class TextFileLoader:
@@ -34,6 +38,61 @@ class TextFileLoader:
34
  def load_documents(self):
35
  self.load()
36
  return self.documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
 
39
  class CharacterTextSplitter:
 
1
  import os
2
+ from typing import List, Union
3
+ from pdfminer.high_level import extract_text
4
+ import io
5
+ from chainlit.types import AskFileResponse
6
+ import re
7
 
8
 
9
  class TextFileLoader:
 
38
  def load_documents(self):
39
  self.load()
40
  return self.documents
41
+
42
class PDFFileLoader(TextFileLoader):
    """Load PDF documents as cleaned plain text.

    Text can come from three places, checked in this order by ``load``:
    a list of uploaded file objects (``files``), a directory or single
    ``.pdf`` file on disk (``path``), or raw PDF bytes already in memory
    (``content``). Every extracted document is passed through
    ``clean_text`` and appended to ``self.documents``.

    NOTE(review): ``self.path``, ``self.encoding`` and ``self.documents``
    are assumed to be set up by ``TextFileLoader.__init__`` -- confirm
    against the base class.
    """

    def __init__(self, path: str, encoding: str = "utf-8", content=None, files: list[AskFileResponse] = None):
        """Store the PDF sources on top of the base loader state.

        Args:
            path: Directory or ``.pdf`` file path; may be empty when
                ``files`` or ``content`` is supplied instead.
            encoding: Passed to pdfminer as the ``codec`` argument.
            content: Raw bytes of a PDF already held in memory.
            files: Uploaded files exposing ``content`` and ``path``.
        """
        super().__init__(path, encoding)
        self.content = content
        self.files = files

    def load(self):
        """Extract text from whichever source was configured.

        Raises:
            ValueError: If no file list was given and ``path`` is neither
                a directory nor a ``.pdf`` path.
        """
        if isinstance(self.files, list):
            for file in self.files:
                # Entries without content or without a .pdf suffix are skipped.
                if file.content and file.path.endswith(".pdf"):
                    self.content = file.content
                    self.load_content()
        elif os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self.path.endswith(".pdf"):
            print("loading file ...")
            self.load_file()
        elif self.content and self.path.endswith(".pdf"):
            print("loading content ...")
            self.load_content()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .pdf file."
            )

    def load_content(self):
        """Load pdf already in memory"""
        text = extract_text(io.BytesIO(self.content))
        self.documents.append(self.clean_text(text))

    def clean_text(self, text):
        """Clean text by removing special characters.

        Newlines and form feeds become spaces, a number sitting right
        before a form feed is dropped as a page number, and runs of
        spaces are collapsed to one.
        """
        # Flatten line breaks first so the page-number pattern below sees
        # "<digits> \x0c" regardless of how the page footer was laid out.
        text = text.replace('\n', ' ')
        text = re.sub(' +', ' ', text)
        # A number directly before a form feed is treated as the page number.
        text = re.sub(r'\d+ \x0c', '\x0c', text)
        text = text.replace('\x0c', ' ')
        # Replacing form feeds can leave adjacent spaces; collapse them again.
        text = re.sub(' +', ' ', text)
        return text

    def load_file(self):
        """Extract and clean the text of the single PDF at ``self.path``."""
        text = extract_text(pdf_file=self.path, codec=self.encoding)
        self.documents.append(self.clean_text(text))

    def load_directory(self):
        """Extract and clean every ``.pdf`` found under ``self.path``."""
        for root, _, files in os.walk(self.path):
            for file in files:
                if file.endswith(".pdf"):
                    # pdfminer's keyword is ``codec``; ``encoding`` is not
                    # an accepted argument of extract_text.
                    text = extract_text(
                        os.path.join(root, file), codec=self.encoding
                    )
                    self.documents.append(self.clean_text(text))
95
+
96
 
97
 
98
  class CharacterTextSplitter:
app.py CHANGED
@@ -7,6 +7,9 @@ import chainlit as cl # importing chainlit for our app
7
  from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
  from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
  from dotenv import load_dotenv
 
 
 
10
 
11
  load_dotenv()
12
 
@@ -18,6 +21,15 @@ user_template = """{input}
18
  Think through your response step by step.
19
  """
20
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  @cl.on_chat_start # marks a function that will be executed at the start of a user session
@@ -44,26 +56,52 @@ async def start_chat():
44
  ).send()
45
 
46
  # let the user know you are processing the file(s)
 
 
 
47
 
48
  # decode the file
 
49
 
50
  # split the text into chunks
 
 
 
 
 
 
51
 
52
  # create a vector store
 
 
 
 
53
 
54
- #
55
-
 
 
 
 
56
 
 
57
 
58
 
59
  @cl.on_message # marks a function that should be run each time the chatbot receives a message from a user
60
  async def main(message: cl.Message):
 
61
  settings = cl.user_session.get("settings")
62
 
63
  client = AsyncOpenAI()
64
 
65
  print(message.content)
66
 
 
 
 
 
 
 
67
  prompt = Prompt(
68
  provider=ChatOpenAI.id,
69
  messages=[
@@ -77,8 +115,16 @@ async def main(message: cl.Message):
77
  template=user_template,
78
  formatted=user_template.format(input=message.content),
79
  ),
 
 
 
 
 
80
  ],
81
- inputs={"input": message.content},
 
 
 
82
  settings=settings,
83
  )
84
 
 
7
  from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
  from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
  from dotenv import load_dotenv
10
+ from aimakerspace.text_utils import PDFFileLoader, CharacterTextSplitter
11
+ from aimakerspace.vectordatabase import VectorDatabase
12
+ import asyncio
13
 
14
  load_dotenv()
15
 
 
21
  Think through your response step by step.
22
  """
23
 
24
+ assistant_template = """Use the following context, if any, to help you
25
+ answer the user's input, if the answer is not in the context say you don't
26
+ know the answer.
27
+ CONTEXT:
28
+ ===============
29
+ {context}
30
+ ===============
31
+ """
32
+
33
 
34
 
35
  @cl.on_chat_start # marks a function that will be executed at the start of a user session
 
56
  ).send()
57
 
58
  # let the user know you are processing the file(s)
59
+ await cl.Message(
60
+ content="Loading your files..."
61
+ ).send()
62
 
63
  # decode the file
64
+ documents = PDFFileLoader(path="", files=files).load_documents()
65
 
66
  # split the text into chunks
67
+ chunks = CharacterTextSplitter(
68
+ chunk_size=1000,
69
+ chunk_overlap=200
70
+ ).split_texts(documents)
71
+
72
+ print(chunks[0])
73
 
74
  # create a vector store
75
+ # let the user know you are processing the document(s)
76
+ await cl.Message(
77
+ content="Creating vector store"
78
+ ).send()
79
 
80
+ vector_db = VectorDatabase()
81
+ vector_db = await vector_db.abuild_from_list(chunks)
82
+
83
+ await cl.Message(
84
+ content="Done"
85
+ ).send()
86
 
87
+ cl.user_session.set("vector_db", vector_db)
88
 
89
 
90
  @cl.on_message # marks a function that should be run each time the chatbot receives a message from a user
91
  async def main(message: cl.Message):
92
+ vector_db = cl.user_session.get("vector_db")
93
  settings = cl.user_session.get("settings")
94
 
95
  client = AsyncOpenAI()
96
 
97
  print(message.content)
98
 
99
+ results_list = vector_db.search_by_text(query_text=message.content, k=3, return_as_text=True)
100
+ if results_list:
101
+ results_string = "\n\n".join(results_list)
102
+ else:
103
+ results_string = ""
104
+
105
  prompt = Prompt(
106
  provider=ChatOpenAI.id,
107
  messages=[
 
115
  template=user_template,
116
  formatted=user_template.format(input=message.content),
117
  ),
118
+ PromptMessage(
119
+ role="assistant",
120
+ template=assistant_template,
121
+ formatted=assistant_template.format(context=results_string)
122
+ )
123
  ],
124
+ inputs={
125
+ "input": message.content,
126
+ "context": results_string
127
+ },
128
  settings=settings,
129
  )
130
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ numpy==1.25.2
7
  pandas
8
  scikit-learn
9
  matplotlib
10
- plotly
 
 
7
  pandas
8
  scikit-learn
9
  matplotlib
10
+ plotly
11
+ pdfminer.six