import os
import pickle

from dotenv import load_dotenv
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

load_dotenv()

# Groq-hosted Llama 3 70B; the API key is read from the environment loaded above.
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0.1, api_key=os.getenv('llm_api_1'))

def load_and_chunk_data(data_path):
    """Load every .txt file under data_path and split it into chunks.

    Each file is first split on Markdown headers (#, ##, ###) so chunks keep
    their section metadata, then further split to at most 512 characters.
    """
    docs = []

    # Recursively collect all .txt files.
    for root, _, files in os.walk(data_path):
        for filename in files:
            if filename.endswith('.txt'):
                file_path = os.path.join(root, filename)
                loader = TextLoader(file_path, encoding='utf-8')
                docs.extend(loader.load())

    headers_to_split_on = [
        ("#", "Header_1"),
        ("##", "Header_2"),
        ("###", "Header_3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )

    chunk_size = 512
    chunk_overlap = 0

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    chunked_docs = []

    # Header-aware split first, then size-based split of each header section.
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        chunked_docs.extend(text_splitter.split_documents(md_header_splits))

    return chunked_docs


# Folder of source .txt documents to index.
data_path = '/home/azureuser/data/gioithieuhocvien'
chunked_data = load_and_chunk_data(data_path)
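# Quick sanity check (illustrative addition, not in the original pipeline): chunks
# produced via MarkdownHeaderTextSplitter carry their section headers in metadata
# ("Header_1", "Header_2", "Header_3").
if chunked_data:
    print(f"Loaded {len(chunked_data)} chunks; first chunk metadata: {chunked_data[0].metadata}")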
# Persist the chunks so the load/split step does not have to be rerun.
with open('gioithieuhocvien_filter.pkl', 'wb') as f:
    pickle.dump(chunked_data, f)

# Embeddings used for indexing. Note: despite the HF_EMBEDDING name, this is an OpenAI model.
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_KEY')
HF_EMBEDDING = OpenAIEmbeddings(model='text-embedding-3-small')
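# Optional sanity check (illustrative addition): confirm the OpenAI key works before the
# bulk indexing call below; text-embedding-3-small returns 1536-dimensional vectors by default.
print(len(HF_EMBEDDING.embed_query("embedding smoke test")))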
# Index the chunks into a local Qdrant collection (Qdrant must be running on port 6333).
url = "http://localhost:6333"
qdrant = QdrantVectorStore.from_documents(
    chunked_data,
    HF_EMBEDDING,
    url=url,
    collection_name="gioithieuhocvien_filter",
)
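# Illustrative retrieval sketch (assumed usage, not part of the original indexing script):
# fetch the top-k chunks for a query from the new collection and let the Groq model answer
# from them. The question string is only a placeholder.
retriever = qdrant.as_retriever(search_kwargs={"k": 4})
question = "What does the academy offer?"  # hypothetical example query
context = "\n\n".join(doc.page_content for doc in retriever.invoke(question))
answer = llm.invoke(
    f"Answer the question using only the context below.\n\n"
    f"Context:\n{context}\n\nQuestion: {question}"
)
print(answer.content)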