Upload 18 files
Browse files- .gitattributes +5 -0
- __pycache__/pdf_viewer_component.cpython-312.pyc +0 -0
- __pycache__/rag_system.cpython-312.pyc +0 -0
- app.py +175 -0
- chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/data_level0.bin +3 -0
- chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/header.bin +3 -0
- chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/index_metadata.pickle +3 -0
- chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/length.bin +3 -0
- chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/link_lists.bin +3 -0
- chroma_db/chroma.sqlite3 +3 -0
- documents/(최종본) 생성형 AI 저작권 안내서.pdf +3 -0
- documents/1. (자료집)생성형 AI 교육자료-ChatGPT 사례 중심으로).pdf +3 -0
- documents/docs1.pdf +3 -0
- documents/docs2.pdf +0 -0
- documents/챗GPT등 생성형 AI 활용 보안 가이드라인.pdf +3 -0
- get-pip.py +0 -0
- pdf_viewer_component.py +23 -0
- rag_system.py +122 -0
- requirements.txt +35 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
documents/(최종본)[[:space:]]생성형[[:space:]]AI[[:space:]]저작권[[:space:]]안내서.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
documents/1.[[:space:]](자료집)생성형[[:space:]]AI[[:space:]]교육자료-ChatGPT[[:space:]]사례[[:space:]]중심으로).pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
+
documents/챗GPT등[[:space:]]생성형[[:space:]]AI[[:space:]]활용[[:space:]]보안[[:space:]]가이드라인.pdf filter=lfs diff=lfs merge=lfs -text
|
40 |
+
documents/docs1.pdf filter=lfs diff=lfs merge=lfs -text
|
__pycache__/pdf_viewer_component.cpython-312.pyc
ADDED
Binary file (1.58 kB). View file
|
|
__pycache__/rag_system.cpython-312.pyc
ADDED
Binary file (6.92 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from rag_system import load_retrieval_qa_chain, get_answer, update_embeddings
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
from PyPDF2 import PdfReader
|
8 |
+
from PIL import Image
|
9 |
+
import io
|
10 |
+
from pydantic_settings import BaseSettings
|
11 |
+
|
12 |
+
# Load environment variables from a local .env file (if present).
load_dotenv()

# Set OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Fix: the original assigned None into os.environ (TypeError) when the
    # key was missing; fail fast with the same message rag_system.py uses.
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Ensure the static directory exists
static_directory = "static"
os.makedirs(static_directory, exist_ok=True)  # idempotent; avoids exists()/makedirs race
23 |
+
|
24 |
+
# PDF utility functions
|
25 |
+
def get_pdf_page_count(file_path):
    """Return the number of pages in the PDF at *file_path*."""
    with open(file_path, 'rb') as handle:
        reader = PdfReader(handle)
        page_total = len(reader.pages)
    return page_total
29 |
+
|
30 |
+
def render_pdf_page(file_path, page_num):
    """Rasterize one PDF page to a PIL image.

    Args:
        file_path: path to the PDF file.
        page_num: 1-based page number to render.

    Returns:
        PIL.Image.Image containing the rendered page (RGB).
    """
    import fitz  # PyMuPDF; imported lazily as in the original
    doc = fitz.open(file_path)
    try:
        page = doc.load_page(page_num - 1)  # PyMuPDF page numbers start from 0
        pix = page.get_pixmap()
        # Copy the pixel buffer into a PIL image before releasing the document.
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()  # fix: the original leaked the open document handle
    return img
37 |
+
|
38 |
+
# Load PDF data
|
39 |
+
def load_pdf_data():
    """Collect metadata for every PDF under ./documents.

    Returns:
        dict mapping filename -> {'path': relative path, 'num_pages': page count}.
    """
    docs_dir = "./documents"
    return {
        name: {
            'path': f"{docs_dir}/{name}",
            'num_pages': get_pdf_page_count(f"{docs_dir}/{name}"),
        }
        for name in os.listdir(docs_dir)
        if name.endswith('.pdf')
    }
49 |
+
|
50 |
+
# Update embeddings with new documents
update_embeddings()

# Load vector store and PDF data
qa_chain = load_retrieval_qa_chain()  # ConversationalRetrievalChain from rag_system
pdf_data = load_pdf_data()  # {filename: {'path', 'num_pages'}}
56 |
+
|
57 |
+
def pdf_viewer_interface(pdf_state, page_number, action=None, page_input=None):
    """Render the current page of the selected PDF after a navigation event.

    Args:
        pdf_state: dict with 'selected_pdf' and 'page_number'.
        page_number: the page currently shown.
        action: "prev"/"next" for relative navigation, or None.
        page_input: absolute page requested via the number box, or None.

    Returns:
        (rendered page image, new page number, new page number as text).
    """
    selected_pdf = pdf_state['selected_pdf']
    max_pages = pdf_data[selected_pdf]['num_pages']
    current_page = page_number

    if action == "prev":
        current_page = max(1, current_page - 1)
    elif action == "next":
        current_page = min(max_pages, current_page + 1)
    elif page_input is not None:
        try:
            requested = int(page_input)
        except ValueError:
            pass  # non-numeric input: keep the current page
        else:
            # Clamp the requested page into [1, max_pages].
            current_page = min(max(requested, 1), max_pages)

    pdf_state['page_number'] = current_page
    image = render_pdf_page(pdf_data[selected_pdf]['path'], current_page)
    return image, current_page, str(current_page)
77 |
+
|
78 |
+
def chat_interface(user_input, chat_history, pdf_state):
    """Answer *user_input* with the RAG chain and append the turn to history.

    Returns:
        (updated chat_history, list of "file.pdf (Page N)" source strings).
    """
    # Flatten [(q, a), ...] into [q, a, q, a, ...] as get_answer expects.
    flat_history = []
    for turn in chat_history:
        flat_history.extend(turn)

    result = get_answer(qa_chain, user_input, flat_history)

    chat_history.append((user_input, result["answer"]))
    return chat_history, result["sources"]
87 |
+
|
88 |
+
def handle_source_click(evt: gr.SelectData, sources, pdf_state, page_number):
    """Jump the PDF viewer to the file/page referenced by a clicked source row.

    Source strings look like "file.pdf (Page 3)". On an out-of-range index,
    a malformed source string, or an unknown file, the view is left unchanged.

    Returns:
        (page image or None, pdf_state, page number, page number as text).
    """
    index = evt.index[0] if isinstance(evt.index, list) else evt.index

    if index >= len(sources):
        return None, pdf_state, page_number, ""

    source = sources[index]
    try:
        file_name, page_str = source.split(' (Page ')
        page = int(page_str.rstrip(')'))
    except ValueError:
        # Fix: a malformed source string no longer crashes the callback.
        return None, pdf_state, page_number, ""

    if file_name not in pdf_data:
        return None, pdf_state, page_number, ""

    pdf_state['selected_pdf'] = file_name
    pdf_state['page_number'] = page
    pdf_path = pdf_data[file_name]['path']
    img = render_pdf_page(pdf_path, page)
    return img, pdf_state, page, str(page)
107 |
+
|
108 |
+
with gr.Blocks() as demo:
    # Default to the first PDF discovered in ./documents.
    initial_pdf = list(pdf_data.keys())[0]
    # Per-session state: which PDF is shown and on which page.
    pdf_state = gr.State({'selected_pdf': initial_pdf, 'page_number': 1})
    sources = gr.State([])  # source strings returned by the last answer
    page_number = gr.State(1)  # page currently shown in the viewer

    with gr.Row():
        with gr.Column(scale=3):
            chat_history = gr.State([])
            chatbot = gr.Chatbot()
            user_input = gr.Textbox(show_label=False, placeholder="Enter your question...")
            # Clickable table of "Source / Page" rows for the last answer.
            source_list = gr.Dataframe(
                headers=["Source", "Page"],
                datatype=["str", "number"],
                row_count=4,
                col_count=2,
                interactive=False,
                label="Sources"
            )

        with gr.Column(scale=2):
            pdf_dropdown = gr.Dropdown(choices=list(pdf_data.keys()), label="Select PDF", value=initial_pdf)
            pdf_viewer = gr.Image(label="PDF Viewer", height=600)
            pdf_page = gr.Number(label="Page Number", value=1)
            with gr.Row():
                prev_button = gr.Button("Previous Page")
                next_button = gr.Button("Next Page")

    # Ask a question, then split each "file.pdf (Page N)" source string into
    # [file, page] rows for the dataframe.
    user_input.submit(chat_interface, [user_input, chat_history, pdf_state], [chatbot, sources]).then(
        lambda s: [[src.split(' (Page ')[0], int(src.split(' (Page ')[1].rstrip(')'))] for src in s],
        inputs=[sources],
        outputs=[source_list]
    )

    # Clicking a source row jumps the viewer to that file/page.
    source_list.select(handle_source_click, [sources, pdf_state, page_number], [pdf_viewer, pdf_state, page_number, pdf_page])

    # Changing the dropdown resets to page 1 of the chosen PDF, then renders it.
    pdf_dropdown.change(
        lambda x: {'selected_pdf': x, 'page_number': 1},
        inputs=[pdf_dropdown],
        outputs=[pdf_state]
    ).then(
        pdf_viewer_interface,
        inputs=[pdf_state, gr.State(1)],
        outputs=[pdf_viewer, page_number, pdf_page]
    )

    prev_button.click(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State("prev")],
        outputs=[pdf_viewer, page_number, pdf_page]
    )

    next_button.click(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State("next")],
        outputs=[pdf_viewer, page_number, pdf_page]
    )

    # Direct page-number entry via the number box.
    pdf_page.submit(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State(None), pdf_page],
        outputs=[pdf_viewer, page_number, pdf_page]
    )

    # NOTE(review): selecting a chatbot message reuses the source-click
    # handler; evt.index here is a chat (row, col) pair, not a source-list
    # index — confirm this wiring is intentional.
    chatbot.select(handle_source_click, [sources, pdf_state, page_number], [pdf_viewer, pdf_state, page_number, pdf_page])

if __name__ == "__main__":
    demo.launch()
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b012862b5134a19825524db76e88f9ed7753467e36912b781a4eedcc5b79e59d
|
3 |
+
size 219940000
|
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aac1349cae78413d368be855bf1b1fe05c464a1466557a38049597c5ab1c2fd2
|
3 |
+
size 100
|
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:558d9768335c4e2df650ce68b66561af9b993d605444d9979dd431ddec869835
|
3 |
+
size 2028516
|
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:955616dc55327c057865719f90dae1fb9511e0350daef736668fe472a58313c9
|
3 |
+
size 140000
|
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bec79adf817e9c72e98c967480193b6a182a6973ead638c1607b88972c4f661f
|
3 |
+
size 303336
|
chroma_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82c709d4b055db6c031121a4019fd81ecaac321cae4bf7fb3117e09baef6d03f
|
3 |
+
size 518348800
|
documents/(최종본) 생성형 AI 저작권 안내서.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0c786f0992d3e227fef6471763d29fc14de622635568a5e7a46ea7bb02bc319
|
3 |
+
size 8063005
|
documents/1. (자료집)생성형 AI 교육자료-ChatGPT 사례 중심으로).pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b99c1c27309f641c2926e8c49c8aba295017622df2c2de614f5c64d600abedee
|
3 |
+
size 23096564
|
documents/docs1.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:85bd8d387d03fda0e5d1fc079cabdea2f85ec4bc6d928ed71ac272fc0b8d6c37
|
3 |
+
size 3476556
|
documents/docs2.pdf
ADDED
Binary file (134 kB). View file
|
|
documents/챗GPT등 생성형 AI 활용 보안 가이드라인.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65d0c5200f2153b0a02f8ed366038b7ad7d1db9aad4386df2eea41fcf0cc2726
|
3 |
+
size 2033387
|
get-pip.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pdf_viewer_component.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit.components.v1 as components
|
2 |
+
import os
|
3 |
+
import base64
|
4 |
+
|
5 |
+
# When False, the Streamlit component frontend is served from a local dev
# server instead of a pre-built bundle.
_RELEASE = False

if not _RELEASE:
    # Development mode: frontend dev server (e.g. `npm start`) on port 3000.
    _component_func = components.declare_component(
        "pdf_viewer",
        url="http://localhost:3000",
    )
else:
    # Release mode: serve the built frontend from frontend/build next to this file.
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(parent_dir, "frontend/build")
    _component_func = components.declare_component("pdf_viewer", path=build_dir)
16 |
+
|
17 |
+
def pdf_viewer(pdf_base64, initial_page=1, key=None):
    """Embed the custom PDF viewer component and return its reported page.

    Args:
        pdf_base64: the PDF content encoded as a base64 string.
        initial_page: page to show first (also the component's default value).
        key: optional Streamlit widget key.
    """
    reported_page = _component_func(
        pdfBase64=pdf_base64,
        initialPage=initial_page,
        key=key,
        default=initial_page,
    )
    return reported_page
20 |
+
|
21 |
+
def load_pdf_as_base64(file_path):
    """Read the file at *file_path* and return its bytes base64-encoded as str."""
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
rag_system.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
from langchain_openai import OpenAIEmbeddings
|
4 |
+
from langchain_chroma import Chroma # 이 줄을 수정
|
5 |
+
from langchain.chains import ConversationalRetrievalChain
|
6 |
+
from langchain_openai import ChatOpenAI
|
7 |
+
from langchain.docstore.document import Document
|
8 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
+
import pdfplumber
|
10 |
+
from concurrent.futures import ThreadPoolExecutor
|
11 |
+
|
12 |
+
# Load environment variables from a local .env file (if present).
load_dotenv()

# Set OpenAI API key; fail fast because every chain below requires it.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")
os.environ["OPENAI_API_KEY"] = api_key
20 |
+
|
21 |
+
def load_retrieval_qa_chain():
    """Build a ConversationalRetrievalChain over the persisted Chroma store.

    Returns:
        A chain configured to also return the source documents so callers
        can cite file/page provenance.
    """
    # Vector store backed by the on-disk Chroma database.
    vectorstore = Chroma(
        persist_directory="./chroma_db",
        embedding_function=OpenAIEmbeddings(),
    )

    # Deterministic chat model (temperature 0).
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    return ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        return_source_documents=True,  # needed by get_answer for citations
    )
39 |
+
|
40 |
+
def extract_text_from_pdf(file_path):
    """Extract text from every page of a PDF and chunk it for embedding.

    Args:
        file_path: path to the PDF file.

    Returns:
        list of langchain Documents, one per chunk, with metadata
        {"source": basename, "page": 1-based page number}. Pages with no
        extractable text are skipped.
    """
    # Hoisted out of the page loop: the splitter is stateless, so one
    # instance serves every page (the original rebuilt it per page).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    source_name = os.path.basename(file_path)

    documents = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue  # blank / image-only page
            for chunk in text_splitter.split_text(text):
                documents.append(
                    Document(
                        page_content=chunk,
                        metadata={"source": source_name, "page": page_num + 1},
                    )
                )
    return documents
53 |
+
|
54 |
+
def embed_documents():
    """Extract, chunk and embed every PDF under ./documents into Chroma."""
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

    pdf_paths = [
        f"./documents/{name}"
        for name in os.listdir("./documents")
        if name.endswith('.pdf')
    ]

    documents = []
    # Parse the PDFs concurrently; extraction is I/O-bound.
    with ThreadPoolExecutor() as executor:
        for chunk_list in executor.map(extract_text_from_pdf, pdf_paths):
            documents.extend(chunk_list)

    vectorstore.add_documents(documents)
65 |
+
|
66 |
+
def update_embeddings():
    """Embed only the PDFs under ./documents that are not yet in the store.

    Fix: the original discovered already-indexed files via
    ``similarity_search("")``, which returns only the top-k (default 4)
    documents — so most indexed files went undetected and were re-embedded
    on every startup. ``vectorstore.get()`` returns the metadata of *all*
    stored chunks.
    """
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

    # Collect the source filenames of every chunk already stored.
    existing_files = set()
    stored = vectorstore.get(include=["metadatas"])
    for metadata in stored.get("metadatas") or []:
        if metadata and "source" in metadata:
            existing_files.add(metadata["source"])

    pdf_files = [f for f in os.listdir("./documents") if f.endswith('.pdf')]
    new_files = [f for f in pdf_files if f not in existing_files]
    if not new_files:
        return  # nothing new to index; avoids add_documents([]) errors

    documents = []
    with ThreadPoolExecutor() as executor:
        results = executor.map(extract_text_from_pdf, [f"./documents/{pdf_file}" for pdf_file in new_files])
        for result in results:
            documents.extend(result)
    if documents:
        vectorstore.add_documents(documents)
84 |
+
|
85 |
+
# Generate answer for a query
|
86 |
+
def get_answer(qa_chain, query, chat_history):
    """Run *query* through the chain and return the answer plus source labels.

    Args:
        qa_chain: a chain exposing ``invoke({"question", "chat_history"})``.
        query: the user question.
        chat_history: flat [q1, a1, q2, a2, ...] list; re-paired into the
            (question, answer) tuples the chain expects.

    Returns:
        {"answer": str, "sources": ["file.pdf (Page N)", ...]}.
    """
    # Pair up the flat history: even indices are questions, odd are answers.
    formatted_history = list(zip(chat_history[::2], chat_history[1::2]))

    response = qa_chain.invoke({"question": query, "chat_history": formatted_history})

    docs = response.get("source_documents", [])
    source_texts = [
        f"{os.path.basename(doc.metadata['source'])} (Page {doc.metadata['page']})"
        for doc in docs
    ]
    return {"answer": response["answer"], "sources": source_texts}
97 |
+
|
98 |
+
# Example usage
if __name__ == "__main__":
    update_embeddings()  # Update embeddings with new documents
    qa_chain = load_retrieval_qa_chain()
    # Korean system-prompt-style question used as a smoke test for the chain
    # (instructs the assistant to use retrieved results, stay accurate,
    # concise, neutral, and admit its limits).
    question = """당신은 RAG(Retrieval-Augmented Generation) 기반 AI 어시스턴트입니다. 다음 지침을 따라 사용자 질문에 답하세요:

1. 검색 결과 활용: 제공된 검색 결과를 분석하고 관련 정보를 사용해 답변하세요.

2. 정확성 유지: 정보의 정확성을 확인하고, 불확실한 경우 이를 명시하세요.

3. 간결한 응답: 질문에 직접 답하고 핵심 내용에 집중하세요.

4. 추가 정보 제안: 관련된 추가 정보가 있다면 언급하세요.

5. 윤리성 고려: 객관적이고 중립적인 태도를 유지하세요.

6. 한계 인정: 답변할 수 없는 경우 솔직히 인정하세요.

7. 대화 유지: 자연스럽게 대화를 이어가고, 필요시 후속 질문을 제안하세요.
항상 정확하고 유용한 정보를 제공하는 것을 목표로 하세요."""

    response = get_answer(qa_chain, question, [])
    print(f"Question: {question}")
    print(f"Answer: {response['answer']}")
    print(f"Sources: {response['sources']}")
requirements.txt
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cffi==1.17.0
|
2 |
+
charset_normalizer==3.3.2
|
3 |
+
constructor==0.1.0
|
4 |
+
Cython==3.0.11
|
5 |
+
cython==3.0.11
|
6 |
+
PyMuPDF==1.24.9
|
7 |
+
flint==0.0.1
|
8 |
+
gmpy2==2.2.1
|
9 |
+
gradio==4.42.0
|
10 |
+
hypothesis==6.111.2
|
11 |
+
ipython==8.12.3
|
12 |
+
langchain==0.2.15
|
13 |
+
langchain_chroma==0.1.3
|
14 |
+
langchain_openai==0.1.23
|
15 |
+
lll==0.0.1
|
16 |
+
mpmath==1.3.0
|
17 |
+
mtrand==0.1
|
18 |
+
mypy==1.11.2
|
19 |
+
numba==0.60.0
|
20 |
+
numeric==24.2
|
21 |
+
olefile==0.47
|
22 |
+
pdfplumber==0.5.28
|
23 |
+
pip==24.2
|
24 |
+
PyInstaller==6.10.0
|
25 |
+
PyJWT==2.9.0
|
26 |
+
PyPDF2==3.0.1
|
27 |
+
PyQt6==6.7.1
|
28 |
+
PySide6==6.7.2
|
29 |
+
pytest==8.3.2
|
30 |
+
python-dotenv==1.0.1
|
31 |
+
pytz==2024.1
|
32 |
+
setuptools==74.0.0
|
33 |
+
streamlit==1.38.0
|
34 |
+
threadpoolctl==3.5.0
|
35 |
+
typing_extensions==4.12.2
|