# pipelines/embedding.py
import os
import pickle

from dotenv import load_dotenv
from langchain.text_splitter import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

load_dotenv()
# LLM handle for downstream RAG pipelines (not used by this indexing script).
llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0.1,
    api_key=os.getenv("llm_api_1"),
)
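# Optional smoke test for the Groq connection; assumes a valid key is set in
# the llm_api_1 environment variable. Uncomment to verify:
# print(llm.invoke("Hello").content)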
def load_and_chunk_data(data_path):
    """Load every .txt file under data_path and split it into chunks.

    Documents are first split on Markdown headers (so each chunk keeps its
    section headers as metadata), then capped at 512 characters.
    """
    docs = []
    # Load all .txt files from the specified folder and its subfolders.
    for root, _, files in os.walk(data_path):
        for filename in files:
            if filename.endswith(".txt"):
                file_path = os.path.join(root, filename)
                loader = TextLoader(file_path, encoding="utf-8")
                docs.extend(loader.load())

    # Split on Markdown headers first, keeping the header text in each chunk.
    headers_to_split_on = [
        ("#", "Header_1"),
        ("##", "Header_2"),
        ("###", "Header_3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )

    # Then cap each header-scoped section at 512 characters with no overlap.
    chunk_size = 512
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    chunked_docs = []
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        chunked_docs.extend(text_splitter.split_documents(md_header_splits))
    return chunked_docs
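# Illustration (based on documented MarkdownHeaderTextSplitter behavior; the
# header text is a hypothetical example): a chunk found under "# Giới thiệu" /
# "## Lịch sử" carries the metadata
# {"Header_1": "Giới thiệu", "Header_2": "Lịch sử"}, which survives the
# character-level split and is stored alongside the vector in Qdrant.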
# Source folder of .txt files (here: academy-introduction documents).
data_path = "/home/azureuser/data/gioithieuhocvien"
chunked_data = load_and_chunk_data(data_path)
# Persist the chunked documents so later runs can skip re-chunking.
with open("gioithieuhocvien_filter.pkl", "wb") as f:
    pickle.dump(chunked_data, f)
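# To reload the cached chunks in a later session (sketch; same file name as
# written above), uncomment:
# with open("gioithieuhocvien_filter.pkl", "rb") as f:
#     chunked_data = pickle.load(f)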
# Embedding model: OpenAI's text-embedding-3-small (the API key is read from
# the OPENAI_KEY environment variable).
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_KEY")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the chunks and upsert them into a local Qdrant collection.
url = "http://localhost:6333"
qdrant = QdrantVectorStore.from_documents(
    chunked_data,
    embedding_model,
    url=url,
    collection_name="gioithieuhocvien_filter",
)
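
# Optional sanity check: retrieve the nearest chunks for a sample query (the
# query text here is just an illustrative example) to confirm the collection
# was populated.
results = qdrant.similarity_search("giới thiệu học viện", k=3)
for doc in results:
    print(doc.metadata, doc.page_content[:100])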