|
import os |
|
import asyncio |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import FAISS |
|
from langchain_huggingface.embeddings import HuggingFaceEmbeddings |
|
|
|
async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Load a PDF, split it into chunks, embed them, and persist a FAISS index.

    The pipeline runs in three stages (load → split → embed/save), streaming
    Korean progress/error messages over *websocket* when one is provided.
    The uploaded source file is always deleted afterwards, even when an
    intermediate stage fails.

    Args:
        file_path: Path to the uploaded PDF file to process.
        user_id: Identifier of the uploading user. NOTE(review): unused in the
            body — presumably kept for interface compatibility; confirm.
        websocket: Optional object exposing an async ``send_text`` method for
            progress reporting. When ``None``, processing is silent.
        upload_directory: Directory under which the FAISS index is written
            (``<upload_directory>/faiss_index``).

    Returns:
        The path of the saved FAISS index on success, ``None`` on any failure.
    """

    async def _notify(message):
        # Best-effort progress reporting; no-op when no websocket is attached.
        if websocket:
            await websocket.send_text(message)

    try:
        # --- Stage 1: load the PDF into LangChain documents ------------------
        try:
            await _notify("1. PDF 파일 로드 중...")
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            await _notify(f"PDF 파일 로드 완료: {len(documents)} 문서")
        except Exception as e:
            await _notify(f"PDF 파일 로드 오류: {e}")
            return None

        # --- Stage 2: split documents into overlapping chunks ----------------
        try:
            await _notify("2. 텍스트 분할 중...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = text_splitter.split_documents(documents)
            await _notify(f"텍스트 분할 완료: {len(docs)} 청크")
        except Exception as e:
            await _notify(f"텍스트 분할 오류: {e}")
            return None

        # --- Stage 3: embed the chunks and persist the FAISS index -----------
        try:
            await _notify("3. 임베딩 생성 및 벡터화 중...")
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
            )
            vectors = FAISS.from_documents(docs, embeddings)
            db_path = os.path.join(upload_directory, "faiss_index")
            vectors.save_local(db_path)
            await _notify(f"FAISS 인덱스 저장 완료: {db_path}")
            return db_path
        except Exception as e:
            await _notify(f"벡터화 오류: {e}")
            return None
    finally:
        # Bug fix: the original attached this cleanup to stage 3's try only, so
        # the uploaded file leaked whenever stage 1 or 2 failed. The outer
        # try/finally guarantees deletion on every exit path.
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await _notify(f"파일 삭제 완료: {file_path}")
        except Exception as e:
            await _notify(f"파일 삭제 오류: {e}")
|
|