qorgh346 committed · Commit 92aef63 · Parent(s): a5173f3
update loader module
app.py
CHANGED
@@ -10,7 +10,7 @@ from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
-from langchain.document_loaders import PyPDFLoader
+from langchain.document_loaders import PyPDFLoader,TextLoader, JSONLoader, CSVLoader
 from tempfile import NamedTemporaryFile
 def get_pdf_text(pdf_docs):
     # text = ''
@@ -32,6 +32,52 @@ def get_pdf_text(pdf_docs):
     return pdf_doc
 
 
+def get_text_file(docs):
+
+    with NamedTemporaryFile() as temp_file:
+        temp_file.write(pdf_docs.getvalue())
+        temp_file.seek(0)
+        text_loader = TextLoader(temp_file.name)
+        text_doc = text_loader.load()
+
+    return text_doc
+
+
+
+def get_csv_file(docs):
+    import pandas as pd
+    text = ''
+
+    data = pd.read_csv(docs)
+
+    for index, row in data.iterrows():
+        item_name = row[0]
+        row_text = item_name
+        for col_name in data.columns[1:]:
+            row_text += '{} is {} '.format(col_name, row[col_name])
+        text += row_text + '\n'
+
+    return text
+
+def get_json_file(docs):
+    with NamedTemporaryFile() as temp_file:
+        temp_file.write(docs.getvalue())
+        temp_file.seek(0)
+        json_loader = JSONLoader(temp_file.name)
+        json_doc = json_loader.load()
+
+    return json_doc
+
+def get_hwp_file(docs):
+    pass
+
+def get_docs_file(docs):
+    pass
+
+
+
+
+
 def get_text_chunks(documents):
 
     text_splitter = RecursiveCharacterTextSplitter(
@@ -99,45 +145,6 @@ def handle_userinput(user_question):
             st.write(bot_template.replace(
                 "{{MSG}}", message.content), unsafe_allow_html=True)
 
-def get_text_file(docs):
-    text = docs.read().decode("utf-8")
-    return text
-
-def get_csv_file(docs):
-    import pandas as pd
-    text = ''
-
-    data = pd.read_csv(docs)
-
-    for index, row in data.iterrows():
-        item_name = row[0]
-        row_text = item_name
-        for col_name in data.columns[1:]:
-            row_text += '{} is {} '.format(col_name, row[col_name])
-        text += row_text + '\n'
-
-    return text
-
-def get_json_file(docs):
-    import json
-    text = ''
-    # with open(docs, 'r') as f:
-    json_data = json.load(docs)
-
-    for f_key, f_value in json_data.items():
-        for s_value in f_value:
-            text += str(f_key) + str(s_value)
-            text += '\n'
-    #print(text)
-    return text
-
-def get_hwp_file(docs):
-    pass
-
-def get_docs_file(docs):
-    pass
-
-
 def main():
     load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs",
@@ -170,7 +177,7 @@ def main():
             raw_text += get_text_file(file)
         elif file.type in ['application/octet-stream', 'application/pdf']:
             #file is .pdf
-            doc_list.
+            doc_list.extend(get_pdf_text(file))
         elif file.type == 'text/csv':
             #file is .csv
             raw_text += get_csv_file(file)
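
Two details in the new loader helpers look fragile. In get_text_file the temporary file is filled from pdf_docs.getvalue() even though the parameter is named docs, which would raise a NameError at runtime; and in many LangChain releases JSONLoader also expects a jq_schema argument (and the jq package installed), so JSONLoader(temp_file.name) alone may fail. A minimal corrected sketch, assuming the uploaded file objects expose getvalue() as elsewhere in this diff and that a jq_schema of '.' fits the JSON files being loaded:

from tempfile import NamedTemporaryFile
from langchain.document_loaders import TextLoader, JSONLoader

def get_text_file(docs):
    # Write the uploaded file's bytes (docs, not pdf_docs) to a temp file,
    # then let TextLoader read it back as Document objects.
    with NamedTemporaryFile(suffix='.txt') as temp_file:
        temp_file.write(docs.getvalue())
        temp_file.seek(0)
        return TextLoader(temp_file.name).load()

def get_json_file(docs):
    # jq_schema='.' keeps the whole JSON document; text_content=False allows
    # non-string payloads. Adjust the schema to the actual JSON layout.
    with NamedTemporaryFile(suffix='.json') as temp_file:
        temp_file.write(docs.getvalue())
        temp_file.seek(0)
        return JSONLoader(temp_file.name, jq_schema='.', text_content=False).load()

Note that these helpers now return lists of Document objects, while main() still does raw_text += get_text_file(file); the call sites may need the same doc_list.extend(...) treatment the PDF branch received in this commit.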
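
The new import line also brings in CSVLoader, but get_csv_file still flattens rows by hand with pandas and returns a plain string. If the intent is to move CSV files onto the same Document-based path as the PDF and text helpers, a sketch following the same NamedTemporaryFile pattern could look like this (the delimiter is an assumption):

from tempfile import NamedTemporaryFile
from langchain.document_loaders import CSVLoader

def get_csv_file(docs):
    # Persist the uploaded CSV to a temp file and let CSVLoader turn each
    # row into a Document, mirroring the PDF/text helpers above.
    with NamedTemporaryFile(suffix='.csv') as temp_file:
        temp_file.write(docs.getvalue())
        temp_file.seek(0)
        csv_loader = CSVLoader(temp_file.name, csv_args={'delimiter': ','})
        return csv_loader.load()

As with the text loader, the call site would then use doc_list.extend(get_csv_file(file)) rather than raw_text += get_csv_file(file).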