JUNGU commited on
Commit
aa8e01a
·
verified ·
1 Parent(s): ea88ab2

Upload 18 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ documents/(최종본)[[:space:]]생성형[[:space:]]AI[[:space:]]저작권[[:space:]]안내서.pdf filter=lfs diff=lfs merge=lfs -text
38
+ documents/1.[[:space:]](자료집)생성형[[:space:]]AI[[:space:]]교육자료-ChatGPT[[:space:]]사례[[:space:]]중심으로).pdf filter=lfs diff=lfs merge=lfs -text
39
+ documents/챗GPT등[[:space:]]생성형[[:space:]]AI[[:space:]]활용[[:space:]]보안[[:space:]]가이드라인.pdf filter=lfs diff=lfs merge=lfs -text
40
+ documents/docs1.pdf filter=lfs diff=lfs merge=lfs -text
__pycache__/pdf_viewer_component.cpython-312.pyc ADDED
Binary file (1.58 kB). View file
 
__pycache__/rag_system.cpython-312.pyc ADDED
Binary file (6.92 kB). View file
 
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import gradio as gr
from dotenv import load_dotenv
from rag_system import load_retrieval_qa_chain, get_answer, update_embeddings
import json
import re
from PyPDF2 import PdfReader
from PIL import Image
import io
from pydantic_settings import BaseSettings

# Load environment variables from a local .env file, if present.
load_dotenv()

# Fail fast with a clear message when the key is missing; the original
# `os.environ[...] = None` assignment raised an opaque TypeError instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set. Add it to the environment or a .env file.")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Ensure the static directory exists (exist_ok avoids the check-then-create race).
static_directory = "static"
os.makedirs(static_directory, exist_ok=True)
23
+
24
# --- PDF utility functions ---
def get_pdf_page_count(file_path):
    """Return the number of pages in the PDF at *file_path*."""
    with open(file_path, 'rb') as handle:
        return len(PdfReader(handle).pages)
29
+
30
def render_pdf_page(file_path, page_num):
    """Rasterize one page of a PDF into a PIL image.

    Args:
        file_path: Path to the PDF document.
        page_num: 1-based page number to render.

    Returns:
        A PIL.Image in RGB mode.
    """
    import fitz  # PyMuPDF (imported lazily, as in the original)

    doc = fitz.open(file_path)
    try:
        page = doc.load_page(page_num - 1)  # fitz pages are 0-based
        pix = page.get_pixmap()
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # The original leaked the document handle on every call; close it.
        doc.close()
37
+
38
# Build an in-memory index of the PDFs available under ./documents.
def load_pdf_data():
    """Map each PDF filename to its path and page count."""
    catalog = {}
    for name in os.listdir("./documents"):
        if not name.endswith('.pdf'):
            continue
        path = f"./documents/{name}"
        catalog[name] = {
            'path': path,
            'num_pages': get_pdf_page_count(path),
        }
    return catalog
49
+
50
# Index any newly added documents before serving queries.
update_embeddings()

# Module-level singletons shared by all Gradio callbacks below.
qa_chain = load_retrieval_qa_chain()
pdf_data = load_pdf_data()
56
+
57
def pdf_viewer_interface(pdf_state, page_number, action=None, page_input=None):
    """Resolve which page to display and render it.

    Args:
        pdf_state: dict with 'selected_pdf' and 'page_number' keys.
        page_number: current page (1-based).
        action: 'prev' / 'next' for relative navigation, or None.
        page_input: direct page entry from the number box, or None.

    Returns:
        (rendered PIL image, page as int, page as str) for the UI outputs.
    """
    selected_pdf = pdf_state['selected_pdf']
    max_pages = pdf_data[selected_pdf]['num_pages']
    # Clamp the incoming value so stale state can never index out of range;
    # int() normalizes the float that gr.Number may hand us.
    current_page = max(1, min(int(page_number), max_pages))

    if action == "prev":
        current_page = max(1, current_page - 1)
    elif action == "next":
        current_page = min(max_pages, current_page + 1)
    elif page_input is not None:
        try:
            current_page = max(1, min(int(page_input), max_pages))
        except (TypeError, ValueError):
            # Ignore non-numeric entry and keep the current page.
            pass

    pdf_state['page_number'] = current_page
    img = render_pdf_page(pdf_data[selected_pdf]['path'], current_page)
    return img, current_page, str(current_page)
77
+
78
def chat_interface(user_input, chat_history, pdf_state):
    """Answer *user_input* via the QA chain and append to the chat log."""
    # Flatten [(q, a), ...] into [q, a, q, a, ...] as get_answer expects.
    flat_history = []
    for pair in chat_history:
        flat_history.extend(pair)

    result = get_answer(qa_chain, user_input, flat_history)
    chat_history.append((user_input, result["answer"]))
    return chat_history, result["sources"]
87
+
88
def handle_source_click(evt: gr.SelectData, sources, pdf_state, page_number):
    """Jump the PDF viewer to the source cited by the clicked row.

    Args:
        evt: Gradio selection event carrying the clicked index.
        sources: list of strings shaped like "file.pdf (Page 3)".
        pdf_state: shared viewer state dict (mutated in place).
        page_number: current page, returned unchanged on failure.

    Returns:
        (image, pdf_state, page int, page str); (None, state, page, "")
        when the click cannot be resolved to a known source.
    """
    index = evt.index[0] if isinstance(evt.index, list) else evt.index

    # Reject negative indices too, not just overruns.
    if not 0 <= index < len(sources):
        return None, pdf_state, page_number, ""

    source = sources[index]
    # rsplit guards against filenames that themselves contain ' (Page ';
    # the try also absorbs malformed source strings.
    try:
        file_name, page_str = source.rsplit(' (Page ', 1)
        page = int(page_str.rstrip(')'))
    except ValueError:
        return None, pdf_state, page_number, ""

    if file_name not in pdf_data:
        return None, pdf_state, page_number, ""

    pdf_state['selected_pdf'] = file_name
    pdf_state['page_number'] = page
    img = render_pdf_page(pdf_data[file_name]['path'], page)
    return img, pdf_state, page, str(page)
107
+
108
with gr.Blocks() as demo:
    # Shared session state.
    initial_pdf = list(pdf_data.keys())[0]
    pdf_state = gr.State({'selected_pdf': initial_pdf, 'page_number': 1})
    sources = gr.State([])
    page_number = gr.State(1)

    with gr.Row():
        # Left column: chat plus the table of cited sources.
        with gr.Column(scale=3):
            chat_history = gr.State([])
            chatbot = gr.Chatbot()
            user_input = gr.Textbox(show_label=False, placeholder="Enter your question...")
            source_list = gr.Dataframe(
                headers=["Source", "Page"],
                datatype=["str", "number"],
                row_count=4,
                col_count=2,
                interactive=False,
                label="Sources",
            )

        # Right column: PDF viewer with navigation controls.
        with gr.Column(scale=2):
            pdf_dropdown = gr.Dropdown(choices=list(pdf_data.keys()), label="Select PDF", value=initial_pdf)
            pdf_viewer = gr.Image(label="PDF Viewer", height=600)
            pdf_page = gr.Number(label="Page Number", value=1)
            with gr.Row():
                prev_button = gr.Button("Previous Page")
                next_button = gr.Button("Next Page")

    # Ask a question, then mirror the returned sources into the table.
    user_input.submit(chat_interface, [user_input, chat_history, pdf_state], [chatbot, sources]).then(
        lambda cites: [[c.split(' (Page ')[0], int(c.split(' (Page ')[1].rstrip(')'))] for c in cites],
        inputs=[sources],
        outputs=[source_list],
    )

    # Clicking a source row jumps the viewer to the cited page.
    source_list.select(handle_source_click, [sources, pdf_state, page_number], [pdf_viewer, pdf_state, page_number, pdf_page])

    # Switching PDFs resets the viewer to page 1.
    pdf_dropdown.change(
        lambda choice: {'selected_pdf': choice, 'page_number': 1},
        inputs=[pdf_dropdown],
        outputs=[pdf_state],
    ).then(
        pdf_viewer_interface,
        inputs=[pdf_state, gr.State(1)],
        outputs=[pdf_viewer, page_number, pdf_page],
    )

    # Relative navigation buttons.
    prev_button.click(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State("prev")],
        outputs=[pdf_viewer, page_number, pdf_page],
    )
    next_button.click(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State("next")],
        outputs=[pdf_viewer, page_number, pdf_page],
    )

    # Direct page entry.
    pdf_page.submit(
        pdf_viewer_interface,
        inputs=[pdf_state, page_number, gr.State(None), pdf_page],
        outputs=[pdf_viewer, page_number, pdf_page],
    )

    # NOTE(review): wiring a chat-message click through the source handler
    # looks intentional here, but the selected index is a chat index, not a
    # source index — confirm this is the desired behavior.
    chatbot.select(handle_source_click, [sources, pdf_state, page_number], [pdf_viewer, pdf_state, page_number, pdf_page])

if __name__ == "__main__":
    demo.launch()
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b012862b5134a19825524db76e88f9ed7753467e36912b781a4eedcc5b79e59d
3
+ size 219940000
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac1349cae78413d368be855bf1b1fe05c464a1466557a38049597c5ab1c2fd2
3
+ size 100
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:558d9768335c4e2df650ce68b66561af9b993d605444d9979dd431ddec869835
3
+ size 2028516
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:955616dc55327c057865719f90dae1fb9511e0350daef736668fe472a58313c9
3
+ size 140000
chroma_db/92bcea26-ebd2-4410-bf8d-bc0f9f5546b7/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bec79adf817e9c72e98c967480193b6a182a6973ead638c1607b88972c4f661f
3
+ size 303336
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82c709d4b055db6c031121a4019fd81ecaac321cae4bf7fb3117e09baef6d03f
3
+ size 518348800
documents/(최종본) 생성형 AI 저작권 안내서.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0c786f0992d3e227fef6471763d29fc14de622635568a5e7a46ea7bb02bc319
3
+ size 8063005
documents/1. (자료집)생성형 AI 교육자료-ChatGPT 사례 중심으로).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b99c1c27309f641c2926e8c49c8aba295017622df2c2de614f5c64d600abedee
3
+ size 23096564
documents/docs1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85bd8d387d03fda0e5d1fc079cabdea2f85ec4bc6d928ed71ac272fc0b8d6c37
3
+ size 3476556
documents/docs2.pdf ADDED
Binary file (134 kB). View file
 
documents/챗GPT등 생성형 AI 활용 보안 가이드라인.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65d0c5200f2153b0a02f8ed366038b7ad7d1db9aad4386df2eea41fcf0cc2726
3
+ size 2033387
get-pip.py ADDED
The diff for this file is too large to render. See raw diff
 
pdf_viewer_component.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit.components.v1 as components
import os
import base64

# Toggle between the dev server (False) and the bundled frontend build (True).
_RELEASE = False

if not _RELEASE:
    # Development: serve the component from the local frontend dev server.
    _component_func = components.declare_component(
        "pdf_viewer",
        url="http://localhost:3000",
    )
else:
    # Production: serve the pre-built frontend shipped next to this file.
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(parent_dir, "frontend/build")
    _component_func = components.declare_component("pdf_viewer", path=build_dir)
16
+
17
def pdf_viewer(pdf_base64, initial_page=1, key=None):
    """Render the PDF viewer component; returns the value it reports back."""
    return _component_func(pdfBase64=pdf_base64, initialPage=initial_page, key=key, default=initial_page)
20
+
21
def load_pdf_as_base64(file_path):
    """Read *file_path* as bytes and return its base64 text encoding."""
    with open(file_path, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode("utf-8")
rag_system.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber
from concurrent.futures import ThreadPoolExecutor

# Pull settings from a .env file when available.
load_dotenv()

# Require the OpenAI key up front so failures surface at import, not mid-query.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")
os.environ["OPENAI_API_KEY"] = api_key
20
+
21
def load_retrieval_qa_chain():
    """Build a ConversationalRetrievalChain over the persisted Chroma store."""
    # Reopen the on-disk vector store with OpenAI embeddings.
    store = Chroma(
        persist_directory="./chroma_db",
        embedding_function=OpenAIEmbeddings(),
    )

    # Deterministic answers: temperature 0.
    model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    # Return source documents so the UI can cite pages.
    return ConversationalRetrievalChain.from_llm(
        model,
        store.as_retriever(),
        return_source_documents=True,
    )
39
+
40
def extract_text_from_pdf(file_path):
    """Split a PDF's text into chunked Documents tagged with source/page.

    Args:
        file_path: Path to the PDF to read.

    Returns:
        list[Document]: one Document per ~1000-char chunk, with metadata
        {'source': <basename>, 'page': <1-based page number>}.
    """
    # The splitter is configuration, not per-page state: build it once
    # instead of re-instantiating it inside the page loop as before.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    source_name = os.path.basename(file_path)

    documents = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue  # skip image-only / empty pages
            for chunk in text_splitter.split_text(text):
                documents.append(
                    Document(page_content=chunk, metadata={"source": source_name, "page": page_num + 1})
                )
    return documents
53
+
54
def embed_documents():
    """(Re)embed every PDF under ./documents into the Chroma store."""
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

    pdf_files = [f for f in os.listdir("./documents") if f.endswith('.pdf')]
    documents = []
    # PDF parsing is I/O-bound, so a thread pool overlaps the reads.
    with ThreadPoolExecutor() as executor:
        results = executor.map(extract_text_from_pdf, [f"./documents/{pdf_file}" for pdf_file in pdf_files])
        for result in results:
            documents.extend(result)

    # Chroma rejects an empty batch; only write when there is something to add.
    if documents:
        vectorstore.add_documents(documents)
65
+
66
def update_embeddings():
    """Embed only the PDFs not yet present in the Chroma store."""
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
    vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

    # Enumerate every stored document's source file. The original used
    # similarity_search(""), which only returns the top-k hits, so files
    # already indexed beyond the first k were silently re-embedded.
    existing_files = set()
    stored = vectorstore.get(include=["metadatas"])
    for metadata in stored.get("metadatas") or []:
        if metadata and "source" in metadata:
            existing_files.add(metadata["source"])

    pdf_files = [f for f in os.listdir("./documents") if f.endswith('.pdf')]
    new_files = [f for f in pdf_files if f not in existing_files]

    documents = []
    with ThreadPoolExecutor() as executor:
        results = executor.map(extract_text_from_pdf, [f"./documents/{pdf_file}" for pdf_file in new_files])
        for result in results:
            documents.extend(result)

    # Chroma rejects an empty batch; skip the write when nothing is new.
    if documents:
        vectorstore.add_documents(documents)
84
+
85
# Generate an answer for a query, with formatted source citations.
def get_answer(qa_chain, query, chat_history):
    """Run *query* through the chain and format its cited sources.

    Args:
        qa_chain: chain object exposing invoke({'question', 'chat_history'}).
        chat_history: flat alternating list [q1, a1, q2, a2, ...].
        query: the user's question.

    Returns:
        dict with 'answer' (str) and 'sources' (list[str] shaped like
        "file.pdf (Page 3)").
    """
    # Re-pair the flat history into (question, answer) tuples.
    formatted_history = list(zip(chat_history[::2], chat_history[1::2]))

    response = qa_chain.invoke({"question": query, "chat_history": formatted_history})

    answer = response["answer"]

    source_texts = []
    for doc in response.get("source_documents", []):
        metadata = doc.metadata or {}
        # Tolerate documents indexed without 'source'/'page' metadata,
        # which previously raised KeyError mid-answer.
        source = os.path.basename(metadata.get("source", "unknown"))
        page = metadata.get("page", "?")
        source_texts.append(f"{source} (Page {page})")

    return {"answer": answer, "sources": source_texts}
97
+
98
# Example usage / smoke test when run as a script.
if __name__ == "__main__":
    # Index any new documents before querying.
    update_embeddings()
    qa_chain = load_retrieval_qa_chain()
    question = """당신은 RAG(Retrieval-Augmented Generation) 기반 AI 어시스턴트입니다. 다음 지침을 따라 사용자 질문에 답하세요:

1. 검색 결과 활용: 제공된 검색 결과를 분석하고 관련 정보를 사용해 답변하세요.

2. 정확성 유지: 정보의 정확성을 확인하고, 불확실한 경우 이를 명시하세요.

3. 간결한 응답: 질문에 직접 답하고 핵심 내용에 집중하세요.

4. 추가 정보 제안: 관련된 추가 정보가 있다면 언급하세요.

5. 윤리성 고려: 객관적이고 중립적인 태도를 유지하세요.

6. 한계 인정: 답변할 수 없는 경우 솔직히 인정하세요.

7. 대화 유지: 자연스럽게 대화를 이어가고, 필요시 후속 질문을 제안하세요.
항상 정확하고 유용한 정보를 제공하는 것을 목표로 하세요."""

    response = get_answer(qa_chain, question, [])
    print(f"Question: {question}")
    print(f"Answer: {response['answer']}")
    print(f"Sources: {response['sources']}")
requirements.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cffi==1.17.0
2
+ charset_normalizer==3.3.2
3
+ constructor==0.1.0
4
+ Cython==3.0.11
5
+ cython==3.0.11
6
+ PyMuPDF==1.24.9  # `fitz` on PyPI is an unrelated placeholder; `import fitz` is provided by PyMuPDF
7
+ flint==0.0.1
8
+ gmpy2==2.2.1
9
+ gradio==4.42.0
10
+ hypothesis==6.111.2
11
+ ipython==8.12.3
12
+ langchain==0.2.15
13
+ langchain_chroma==0.1.3
14
+ langchain_openai==0.1.23
15
+ lll==0.0.1
16
+ mpmath==1.3.0
17
+ mtrand==0.1
18
+ mypy==1.11.2
19
+ numba==0.60.0
20
+ numeric==24.2
21
+ olefile==0.47
22
+ pdfplumber==0.5.28
23
+ pip==24.2
24
+ PyInstaller==6.10.0
25
+ PyJWT==2.9.0
26
+ PyPDF2==3.0.1
27
+ PyQt6==6.7.1
28
+ PySide6==6.7.2
29
+ pytest==8.3.2
30
+ python-dotenv==1.0.1
31
+ pytz==2024.1
32
+ setuptools==74.0.0
33
+ streamlit==1.38.0
34
+ threadpoolctl==3.5.0
35
+ typing_extensions==4.12.2