qorgh346 committed on
Commit
92aef63
·
1 Parent(s): a5173f3

update loader module

Browse files
Files changed (1) hide show
  1. app.py +48 -41
app.py CHANGED
@@ -10,7 +10,7 @@ from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
13
- from langchain.document_loaders import PyPDFLoader
14
  from tempfile import NamedTemporaryFile
15
  def get_pdf_text(pdf_docs):
16
  # text = ''
@@ -32,6 +32,52 @@ def get_pdf_text(pdf_docs):
32
  return pdf_doc
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def get_text_chunks(documents):
36
 
37
  text_splitter = RecursiveCharacterTextSplitter(
@@ -99,45 +145,6 @@ def handle_userinput(user_question):
99
  st.write(bot_template.replace(
100
  "{{MSG}}", message.content), unsafe_allow_html=True)
101
 
102
- def get_text_file(docs):
103
- text = docs.read().decode("utf-8")
104
- return text
105
-
106
- def get_csv_file(docs):
107
- import pandas as pd
108
- text = ''
109
-
110
- data = pd.read_csv(docs)
111
-
112
- for index, row in data.iterrows():
113
- item_name = row[0]
114
- row_text = item_name
115
- for col_name in data.columns[1:]:
116
- row_text += '{} is {} '.format(col_name, row[col_name])
117
- text += row_text + '\n'
118
-
119
- return text
120
-
121
- def get_json_file(docs):
122
- import json
123
- text = ''
124
- # with open(docs, 'r') as f:
125
- json_data = json.load(docs)
126
-
127
- for f_key, f_value in json_data.items():
128
- for s_value in f_value:
129
- text += str(f_key) + str(s_value)
130
- text += '\n'
131
- #print(text)
132
- return text
133
-
134
- def get_hwp_file(docs):
135
- pass
136
-
137
- def get_docs_file(docs):
138
- pass
139
-
140
-
141
  def main():
142
  load_dotenv()
143
  st.set_page_config(page_title="Chat with multiple PDFs",
@@ -170,7 +177,7 @@ def main():
170
  raw_text += get_text_file(file)
171
  elif file.type in ['application/octet-stream', 'application/pdf']:
172
  #file is .pdf
173
- doc_list.append(get_pdf_text(file))
174
  elif file.type == 'text/csv':
175
  #file is .csv
176
  raw_text += get_csv_file(file)
 
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
13
+ from langchain.document_loaders import PyPDFLoader,TextLoader, JSONLoader, CSVLoader
14
  from tempfile import NamedTemporaryFile
15
  def get_pdf_text(pdf_docs):
16
  # text = ''
 
32
  return pdf_doc
33
 
34
 
35
def get_text_file(docs):
    """Load an uploaded plain-text file through LangChain's ``TextLoader``.

    The upload is copied to a named temporary file because ``TextLoader``
    takes a filesystem path rather than an in-memory buffer.

    Args:
        docs: Uploaded file object exposing ``getvalue()`` that returns the
            raw bytes (presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller).

    Returns:
        Whatever ``TextLoader.load()`` returns for the file.

    NOTE(review): the caller visible in ``main`` does
    ``raw_text += get_text_file(file)``; confirm that ``raw_text`` can
    accept this return value.
    """
    with NamedTemporaryFile() as temp_file:
        # Fixed: the original read from the undefined name ``pdf_docs``,
        # which raised NameError on every call.
        temp_file.write(docs.getvalue())
        # Push buffered bytes to disk before the loader reopens the path.
        temp_file.flush()
        text_loader = TextLoader(temp_file.name)
        # Load inside the ``with`` block: the file is deleted on exit.
        text_doc = text_loader.load()

    return text_doc
44
+
45
+
46
+
47
def get_csv_file(docs):
    """Flatten a CSV file into one line of text per data row.

    Each output line is the value of the first column immediately followed
    by ``"<column> is <value> "`` for every remaining column, and is
    terminated by a newline. No separator is inserted between the first
    column's value and the first ``"<column> is"`` fragment.

    Args:
        docs: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        str: The concatenated row descriptions; an empty string when the
        CSV has a header but no data rows.
    """
    import pandas as pd

    data = pd.read_csv(docs)
    other_columns = list(data.columns[1:])

    lines = []
    for _, row in data.iterrows():
        # ``iloc`` avoids the deprecated positional fallback of ``row[0]``;
        # ``str`` keeps numeric first columns from raising TypeError when
        # text is appended to them.
        item_name = str(row.iloc[0])
        details = ''.join(
            '{} is {} '.format(col_name, row[col_name])
            for col_name in other_columns
        )
        lines.append(item_name + details + '\n')

    return ''.join(lines)
61
+
62
def get_json_file(docs):
    """Load an uploaded JSON file through LangChain's ``JSONLoader``.

    The upload is copied to a named temporary file because ``JSONLoader``
    takes a filesystem path rather than an in-memory buffer.

    Args:
        docs: Uploaded file object exposing ``getvalue()`` that returns the
            raw bytes (presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller).

    Returns:
        Whatever ``JSONLoader.load()`` returns for the file.
    """
    with NamedTemporaryFile() as temp_file:
        temp_file.write(docs.getvalue())
        # Push buffered bytes to disk before the loader reopens the path.
        temp_file.flush()
        # Fixed: ``jq_schema`` is a required argument of JSONLoader; the
        # original omitted it and failed at construction time. '.' selects
        # the whole JSON value, and ``text_content=False`` allows that
        # value to be something other than a plain string.
        json_loader = JSONLoader(
            temp_file.name,
            jq_schema='.',
            text_content=False,
        )
        # Load inside the ``with`` block: the file is deleted on exit.
        json_doc = json_loader.load()

    return json_doc
70
+
71
def get_hwp_file(docs):
    """Placeholder for an HWP loader; not implemented, always returns None."""
    return None
73
+
74
def get_docs_file(docs):
    """Placeholder for a docs-file loader; not implemented, always returns None."""
    return None
76
+
77
+
78
+
79
+
80
+
81
  def get_text_chunks(documents):
82
 
83
  text_splitter = RecursiveCharacterTextSplitter(
 
145
  st.write(bot_template.replace(
146
  "{{MSG}}", message.content), unsafe_allow_html=True)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def main():
149
  load_dotenv()
150
  st.set_page_config(page_title="Chat with multiple PDFs",
 
177
  raw_text += get_text_file(file)
178
  elif file.type in ['application/octet-stream', 'application/pdf']:
179
  #file is .pdf
180
+ doc_list.extend(get_pdf_text(file))
181
  elif file.type == 'text/csv':
182
  #file is .csv
183
  raw_text += get_csv_file(file)