# demo/app/modules/embedding.py
# Author: tekville
# Commit: Initial commit (ff72db3)
import os
import asyncio
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
async def _notify(websocket, message):
    """Send *message* over *websocket* if one was supplied; otherwise do nothing."""
    if websocket:
        await websocket.send_text(message)


def _build_and_save_index(docs, db_path):
    """Embed *docs*, build a FAISS index from them and save it under *db_path*.

    Synchronous on purpose: it is meant to be run in a worker thread so the
    model construction, embedding and disk write do not block the event loop.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectors = FAISS.from_documents(docs, embeddings)
    vectors.save_local(db_path)


async def _remove_uploaded_file(file_path, websocket):
    """Delete *file_path* if it exists and report the outcome over *websocket*."""
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            await _notify(websocket, f"파일 삭제 완료: {file_path}")
    except Exception as e:
        await _notify(websocket, f"파일 삭제 오류: {e}")


async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Load a PDF, split it into chunks, embed them and save a FAISS index.

    Progress and error messages are sent with ``websocket.send_text`` when a
    websocket is supplied. Failures in any step are reported that way and the
    function returns early; they are not re-raised.

    The uploaded file at ``file_path`` is removed once processing ends,
    whether every step succeeded or one of them failed.

    Args:
        file_path: Path of the uploaded PDF to process. Deleted afterwards.
        user_id: Accepted for interface compatibility but currently unused.
            NOTE(review): the index is always written to
            ``<upload_directory>/faiss_index`` regardless of the user, so
            successive uploads appear to replace the same index - confirm
            that this is intended.
        websocket: Optional object exposing an awaitable ``send_text(str)``.
        upload_directory: Directory under which ``faiss_index`` is saved.

    Returns:
        None.
    """
    loop = asyncio.get_running_loop()
    try:
        # 1. Load the PDF file.
        try:
            await _notify(websocket, "1. PDF 파일 로드 중...")
            loader = PyPDFLoader(file_path)
            # Parsing is blocking work; keep it off the event loop.
            documents = await loop.run_in_executor(None, loader.load)
            await _notify(websocket, f"PDF 파일 로드 완료: {len(documents)} 문서")
        except Exception as e:
            await _notify(websocket, f"PDF 파일 로드 오류: {e}")
            return

        # 2. Split the text into overlapping chunks.
        try:
            await _notify(websocket, "2. 텍스트 분할 중...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = text_splitter.split_documents(documents)
            await _notify(websocket, f"텍스트 분할 완료: {len(docs)} 청크")
        except Exception as e:
            await _notify(websocket, f"텍스트 분할 오류: {e}")
            return

        # 3-4. Create the embeddings, build the vector index and save it.
        try:
            await _notify(websocket, "3. 임베딩 생성 및 벡터화 중...")
            db_path = os.path.join(upload_directory, "faiss_index")
            # Model loading, embedding and saving are blocking; run in a thread.
            await loop.run_in_executor(None, _build_and_save_index, docs, db_path)
            await _notify(websocket, f"FAISS 인덱스 저장 완료: {db_path}")
        except Exception as e:
            await _notify(websocket, f"벡터화 오류: {e}")
            return
    finally:
        # 5. Delete the uploaded file. This wraps the whole pipeline so that
        # an early return from steps 1 or 2 no longer leaves the file behind.
        await _remove_uploaded_file(file_path, websocket)