jungwoo3490 committed on
Commit
1315281
Β·
1 Parent(s): c079d9b

edit app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -51
app.py CHANGED
@@ -1,90 +1,100 @@
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
- from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
- from langchain.vectorstores import FAISS, Chroma
7
  from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
8
- from langchain.chat_models import ChatOpenAI
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
- from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
- import tempfile # μž„μ‹œ νŒŒμΌμ„ μƒμ„±ν•˜κΈ° μœ„ν•œ λΌμ΄λΈŒλŸ¬λ¦¬μž…λ‹ˆλ‹€.
15
  import os
 
16
 
17
 
18
  # PDF λ¬Έμ„œλ‘œλΆ€ν„° ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
19
  def get_pdf_text(pdf_docs):
20
- temp_dir = tempfile.TemporaryDirectory() # μž„μ‹œ 디렉토리λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
21
- temp_filepath = os.path.join(temp_dir.name, pdf_docs.name) # μž„μ‹œ 파일 경둜λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
22
  with open(temp_filepath, "wb") as f: # μž„μ‹œ νŒŒμΌμ„ λ°”μ΄λ„ˆλ¦¬ μ“°κΈ° λͺ¨λ“œλ‘œ μ—½λ‹ˆλ‹€.
23
- f.write(pdf_docs.getvalue()) # PDF λ¬Έμ„œμ˜ λ‚΄μš©μ„ μž„μ‹œ νŒŒμΌμ— μ”λ‹ˆλ‹€.
24
- pdf_loader = PyPDFLoader(temp_filepath) # PyPDFLoaderλ₯Ό μ‚¬μš©ν•΄ PDFλ₯Ό λ‘œλ“œν•©λ‹ˆλ‹€.
25
- pdf_doc = pdf_loader.load() # ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•©λ‹ˆλ‹€.
26
- return pdf_doc # μΆ”μΆœν•œ ν…μŠ€νŠΈλ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€.
 
27
 
28
  # 과제
29
  # μ•„λž˜ ν…μŠ€νŠΈ μΆ”μΆœ ν•¨μˆ˜λ₯Ό μž‘μ„±
 
 
 
30
 
 
 
31
 
32
- def get_text_file(text_docs):
33
- temp_dir = tempfile.TemporaryDirectory()
34
- temp_filepath = os.path.join(temp_dir.name, text_docs.name)
35
- with open(temp_filepath, "wb") as f:
36
- f.write(text_docs.getvalue())
37
- text_loader = TextLoader(temp_filepath)
38
- text_doc = text_loader.load()
39
- return text_doc
40
 
41
 
42
- def get_csv_file(csv_docs):
43
  temp_dir = tempfile.TemporaryDirectory()
44
- temp_filepath = os.path.join(temp_dir.name, csv_docs.name)
45
- with open(temp_filepath, "wb") as f:
46
- f.write(csv_docs.getvalue())
 
 
47
  csv_loader = CSVLoader(temp_filepath)
48
  csv_doc = csv_loader.load()
 
49
  return csv_doc
50
 
51
 
52
- def get_json_file(json_docs):
53
  temp_dir = tempfile.TemporaryDirectory()
54
- temp_filepath = os.path.join(temp_dir.name, json_docs.name)
55
- with open(temp_filepath, "wb") as f:
56
- f.write(json_docs.getvalue())
57
- json_loader = JSONLoader(temp_filepath)
 
 
58
  json_doc = json_loader.load()
 
59
  return json_doc
60
 
61
-
62
  # λ¬Έμ„œλ“€μ„ μ²˜λ¦¬ν•˜μ—¬ ν…μŠ€νŠΈ 청크둜 λ‚˜λˆ„λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
63
  def get_text_chunks(documents):
64
  text_splitter = RecursiveCharacterTextSplitter(
65
- chunk_size=1000, # 청크의 크기λ₯Ό μ§€μ •ν•©λ‹ˆλ‹€.
66
- chunk_overlap=200, # 청크 μ‚¬μ΄μ˜ 쀑볡을 μ§€μ •ν•©λ‹ˆλ‹€.
67
- length_function=len # ν…μŠ€νŠΈμ˜ 길이λ₯Ό μΈ‘μ •ν•˜λŠ” ν•¨μˆ˜λ₯Ό μ§€μ •ν•©λ‹ˆλ‹€.
68
  )
69
 
70
- documents = text_splitter.split_documents(documents) # λ¬Έμ„œλ“€μ„ 청크둜 λ‚˜λˆ•λ‹ˆλ‹€
71
- return documents # λ‚˜λˆˆ 청크λ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€.
72
 
73
 
74
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
75
  def get_vectorstore(text_chunks):
76
- # OpenAI μž„λ² λ”© λͺ¨λΈμ„ λ‘œλ“œν•©λ‹ˆλ‹€. (Embedding models - Ada v2)
77
-
78
- embeddings = OpenAIEmbeddings()
79
- vectorstore = FAISS.from_documents(text_chunks, embeddings) # FAISS 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
80
-
81
- return vectorstore # μƒμ„±λœ 벑터 μŠ€ν† μ–΄λ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€.
82
 
83
 
84
  def get_conversation_chain(vectorstore):
85
- gpt_model_name = 'gpt-3.5-turbo'
86
- llm = ChatOpenAI(model_name = gpt_model_name) #gpt-3.5 λͺ¨λΈ λ‘œλ“œ
87
-
 
 
 
 
 
88
  # λŒ€ν™” 기둝을 μ €μž₯ν•˜κΈ° μœ„ν•œ λ©”λͺ¨λ¦¬λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
89
  memory = ConversationBufferMemory(
90
  memory_key='chat_history', return_messages=True)
@@ -94,10 +104,12 @@ def get_conversation_chain(vectorstore):
94
  retriever=vectorstore.as_retriever(),
95
  memory=memory
96
  )
97
- return conversation_chain
 
98
 
99
  # μ‚¬μš©μž μž…λ ₯을 μ²˜λ¦¬ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
100
  def handle_userinput(user_question):
 
101
  # λŒ€ν™” 체인을 μ‚¬μš©ν•˜μ—¬ μ‚¬μš©μž μ§ˆλ¬Έμ— λŒ€ν•œ 응닡을 μƒμ„±ν•©λ‹ˆλ‹€.
102
  response = st.session_state.conversation({'question': user_question})
103
  # λŒ€ν™” 기둝을 μ €μž₯ν•©λ‹ˆλ‹€.
@@ -123,16 +135,12 @@ def main():
123
  if "chat_history" not in st.session_state:
124
  st.session_state.chat_history = None
125
 
126
- st.header("Chat with multiple Files :")
127
  user_question = st.text_input("Ask a question about your documents:")
128
  if user_question:
129
  handle_userinput(user_question)
130
 
131
  with st.sidebar:
132
- openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
133
- if openai_key:
134
- os.environ["OPENAI_API_KEY"] = openai_key
135
-
136
  st.subheader("Your documents")
137
  docs = st.file_uploader(
138
  "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
@@ -168,4 +176,4 @@ def main():
168
 
169
 
170
  if __name__ == '__main__':
171
- main()
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
 
3
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import FAISS
 
5
  from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
 
6
  from langchain.memory import ConversationBufferMemory
7
  from langchain.chains import ConversationalRetrievalChain
8
  from htmlTemplates import css, bot_template, user_template
9
+ from langchain.llms import LlamaCpp # For loading transformer models.
10
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
11
+ import tempfile # μž„μ‹œ νŒŒμΌμ„ μƒμ„±ν•˜κΈ° μœ„ν•œ λΌμ΄λΈŒλŸ¬λ¦¬μž…λ‹ˆλ‹€.
12
  import os
13
+ from huggingface_hub import hf_hub_download # Hugging Face Hubμ—μ„œ λͺ¨λΈμ„ λ‹€μš΄λ‘œλ“œν•˜κΈ° μœ„ν•œ ν•¨μˆ˜μž…λ‹ˆλ‹€.
14
 
15
 
16
  # PDF λ¬Έμ„œλ‘œλΆ€ν„° ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
17
def get_pdf_text(pdf_docs):
    """Extract text from an uploaded PDF file.

    Args:
        pdf_docs: A Streamlit ``UploadedFile`` holding PDF bytes
            (exposes ``.name`` and ``.getvalue()``).

    Returns:
        list: LangChain ``Document`` objects, one per PDF page.
    """
    # Use a context manager so the temporary directory is removed
    # deterministically, instead of relying on garbage-collection-time
    # finalization of the TemporaryDirectory object.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Write the upload to disk because PyPDFLoader takes a file path.
        temp_filepath = os.path.join(temp_dir, pdf_docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        # PyPDFLoader.load() reads eagerly, so the temp file can be
        # deleted as soon as this returns.
        pdf_loader = PyPDFLoader(temp_filepath)
        return pdf_loader.load()
25
+
26
 
27
  # 과제
28
  # μ•„λž˜ ν…μŠ€νŠΈ μΆ”μΆœ ν•¨μˆ˜λ₯Ό μž‘μ„±
29
def get_text_file(docs):
    """Extract text from an uploaded plain-text file.

    Args:
        docs: A Streamlit ``UploadedFile`` holding text bytes
            (exposes ``.name`` and ``.getvalue()``).

    Returns:
        list: LangChain ``Document`` objects produced by TextLoader.
    """
    # Deterministic temp-dir cleanup via context manager (consistent
    # with get_pdf_text) rather than GC-time finalization.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # TextLoader.load() reads the file eagerly.
        txt_loader = TextLoader(temp_filepath)
        return txt_loader.load()
 
 
 
 
40
 
41
 
42
def get_csv_file(docs):
    """Extract rows from an uploaded CSV file.

    Args:
        docs: A Streamlit ``UploadedFile`` holding CSV bytes
            (exposes ``.name`` and ``.getvalue()``).

    Returns:
        list: LangChain ``Document`` objects, typically one per row.
    """
    # Deterministic temp-dir cleanup via context manager (consistent
    # with get_pdf_text) rather than GC-time finalization.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # CSVLoader.load() reads the file eagerly.
        csv_loader = CSVLoader(temp_filepath)
        return csv_loader.load()
53
 
54
 
55
def get_json_file(docs):
    """Extract records from an uploaded JSON file.

    Args:
        docs: A Streamlit ``UploadedFile`` holding JSON bytes
            (exposes ``.name`` and ``.getvalue()``).

    Returns:
        list: LangChain ``Document`` objects selected by the jq schema.
    """
    # Deterministic temp-dir cleanup via context manager (consistent
    # with get_pdf_text) rather than GC-time finalization.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, docs.name)
        with open(temp_filepath, "wb") as f:
            f.write(docs.getvalue())
        # BUG FIX: the keyword was misspelled 'jq_scheama', which raises
        # TypeError at call time; JSONLoader's parameter is 'jq_schema'.
        # NOTE(review): '사원' alone may not be a valid jq expression —
        # a jq path such as '.사원[]' is the usual form; confirm against
        # the actual JSON structure.
        json_loader = JSONLoader(temp_filepath, jq_schema="사원")
        return json_loader.load()
66
 
67
+
68
  # λ¬Έμ„œλ“€μ„ μ²˜λ¦¬ν•˜μ—¬ ν…μŠ€νŠΈ 청크둜 λ‚˜λˆ„λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
69
def get_text_chunks(documents):
    """Split documents into overlapping text chunks.

    Args:
        documents: LangChain ``Document`` objects to split.

    Returns:
        list: Chunked ``Document`` objects, at most 1000 characters each
        with a 200-character overlap between consecutive chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,        # maximum size of each chunk
        chunk_overlap=200,      # overlap shared by adjacent chunks
        length_function=len,    # measure length in characters
    )
    return splitter.split_documents(documents)
78
 
79
 
80
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
81
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks.

    Args:
        text_chunks: Chunked LangChain ``Document`` objects.

    Returns:
        FAISS: A vector store indexing the chunks with a HuggingFace
        sentence-transformer embedding model running on CPU.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L12-v2',
        model_kwargs={'device': 'cpu'},
    )
    return FAISS.from_documents(text_chunks, embedding_model)
 
87
 
88
 
89
  def get_conversation_chain(vectorstore):
90
+ model_name_or_path = 'TheBloke/Llama-2-7B-chat-GGUF'
91
+ model_basename = 'llama-2-7b-chat.Q2_K.gguf'
92
+ model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
93
+
94
+ llm = LlamaCpp(model_path=model_path,
95
+ n_ctx=4086,
96
+ input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
97
+ verbose=True, )
98
  # λŒ€ν™” 기둝을 μ €μž₯ν•˜κΈ° μœ„ν•œ λ©”λͺ¨λ¦¬λ₯Ό μƒμ„±ν•©λ‹ˆλ‹€.
99
  memory = ConversationBufferMemory(
100
  memory_key='chat_history', return_messages=True)
 
104
  retriever=vectorstore.as_retriever(),
105
  memory=memory
106
  )
107
+ return conversation_chain # μƒμ„±λœ λŒ€ν™” 체인을 λ°˜ν™˜ν•©λ‹ˆλ‹€.
108
+
109
 
110
  # μ‚¬μš©μž μž…λ ₯을 μ²˜λ¦¬ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
111
  def handle_userinput(user_question):
112
+ print('user_question => ', user_question)
113
  # λŒ€ν™” 체인을 μ‚¬μš©ν•˜μ—¬ μ‚¬μš©μž μ§ˆλ¬Έμ— λŒ€ν•œ 응닡을 μƒμ„±ν•©λ‹ˆλ‹€.
114
  response = st.session_state.conversation({'question': user_question})
115
  # λŒ€ν™” 기둝을 μ €μž₯ν•©λ‹ˆλ‹€.
 
135
  if "chat_history" not in st.session_state:
136
  st.session_state.chat_history = None
137
 
138
+ st.header("Chat with multiple Files:")
139
  user_question = st.text_input("Ask a question about your documents:")
140
  if user_question:
141
  handle_userinput(user_question)
142
 
143
  with st.sidebar:
 
 
 
 
144
  st.subheader("Your documents")
145
  docs = st.file_uploader(
146
  "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
 
176
 
177
 
178
  if __name__ == '__main__':
179
+ main()