Spaces:
Sleeping
Sleeping
import transformers | |
import torch | |
import os | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
from transformers import pipeline | |
from langchain.llms import HuggingFacePipeline | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.chains import RetrievalQA | |
from langchain.document_loaders import TextLoader | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.document_loaders import DirectoryLoader | |
from InstructorEmbedding import INSTRUCTOR | |
from langchain.embeddings import HuggingFaceInstructEmbeddings | |
from langchain_community.vectorstores import Chroma | |
import textwrap | |
def gen_vectordb(
    source_dir: str = "./new_papers",
    persist_directory: str = "db",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: str = "hkunlp/instructor-base",
) -> None:
    """Build a Chroma vector store from the PDFs in ``source_dir`` and persist it.

    The PDFs are loaded with ``PyPDFLoader``, split into overlapping chunks
    with ``RecursiveCharacterTextSplitter``, embedded with
    ``HuggingFaceInstructEmbeddings`` and written to ``persist_directory``.

    Only the embedding model is instantiated here. Building the index does
    not involve a text-generation model, so none is loaded by this function.

    Args:
        source_dir: Directory searched (non-recursively, via the ``./*.pdf``
            glob) for PDF files.
        persist_directory: Directory handed to Chroma as its on-disk location.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        embedding_model: Model name passed to ``HuggingFaceInstructEmbeddings``.

    Returns:
        None. The result of the call is the persisted store on disk.
    """
    # Load every PDF that sits directly inside source_dir.
    loader = DirectoryLoader(source_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    # Split into overlapping chunks so retrieval can work on passages
    # rather than on whole documents.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)

    instructor_embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model)

    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=instructor_embeddings,
        persist_directory=persist_directory,
    )

    # Explicit flush kept from the original implementation.
    # NOTE(review): newer Chroma releases reportedly persist automatically
    # and deprecate this call -- confirm against the installed version.
    vectordb.persist()
# Script entry point: rebuild the vector store when this file is run directly.
if __name__ == "__main__":
    gen_vectordb()