|
import os |
|
import asyncio |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import FAISS |
|
from langchain_huggingface.embeddings import HuggingFaceEmbeddings |
|
|
|
async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Load a PDF, split it into chunks, embed them, and persist a FAISS index.

    The pipeline runs in three stages (load → split → embed/save), streaming
    Korean progress/error messages over *websocket* when one is provided.
    The uploaded source file is always deleted afterwards, even when an
    intermediate stage fails.

    Args:
        file_path: Path to the uploaded PDF file to process.
        user_id: Identifier of the uploading user. NOTE(review): unused in the
            body — presumably kept for interface compatibility; confirm.
        websocket: Optional object exposing an async ``send_text`` method for
            progress reporting. When ``None``, processing is silent.
        upload_directory: Directory under which the FAISS index is written
            (``<upload_directory>/faiss_index``).

    Returns:
        The path of the saved FAISS index on success, ``None`` on any failure.
    """

    async def _notify(message):
        # Best-effort progress reporting; no-op when no websocket is attached.
        if websocket:
            await websocket.send_text(message)

    try:
        # --- Stage 1: load the PDF into LangChain documents ------------------
        try:
            await _notify("1. PDF 파일 로드 중...")
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            await _notify(f"PDF 파일 로드 완료: {len(documents)} 문서")
        except Exception as e:
            await _notify(f"PDF 파일 로드 오류: {e}")
            return None

        # --- Stage 2: split documents into overlapping chunks ----------------
        try:
            await _notify("2. 텍스트 분할 중...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = text_splitter.split_documents(documents)
            await _notify(f"텍스트 분할 완료: {len(docs)} 청크")
        except Exception as e:
            await _notify(f"텍스트 분할 오류: {e}")
            return None

        # --- Stage 3: embed the chunks and persist the FAISS index -----------
        try:
            await _notify("3. 임베딩 생성 및 벡터화 중...")
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
            )
            vectors = FAISS.from_documents(docs, embeddings)
            db_path = os.path.join(upload_directory, "faiss_index")
            vectors.save_local(db_path)
            await _notify(f"FAISS 인덱스 저장 완료: {db_path}")
            return db_path
        except Exception as e:
            await _notify(f"벡터화 오류: {e}")
            return None
    finally:
        # Bug fix: the original attached this cleanup to stage 3's try only, so
        # the uploaded file leaked whenever stage 1 or 2 failed. The outer
        # try/finally guarantees deletion on every exit path.
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await _notify(f"파일 삭제 완료: {file_path}")
        except Exception as e:
            await _notify(f"파일 삭제 오류: {e}")
|
|