suchinth08 committed on
Commit
bc13ac3
·
verified ·
1 Parent(s): 6a24062

Delete lpphelper.py

Browse files
Files changed (1) hide show
  1. lpphelper.py +0 -50
lpphelper.py DELETED
@@ -1,50 +0,0 @@
1
- import transformers
2
- import torch
3
- import os
4
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
- from transformers import pipeline
6
- from langchain.llms import HuggingFacePipeline
7
- from langchain.vectorstores import Chroma
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from langchain.chains import RetrievalQA
10
- from langchain.document_loaders import TextLoader
11
- from langchain.document_loaders import PyPDFLoader
12
- from langchain.document_loaders import DirectoryLoader
13
- from InstructorEmbedding import INSTRUCTOR
14
- from langchain.embeddings import HuggingFaceInstructEmbeddings
15
- from langchain_community.vectorstores import Chroma
16
- import textwrap
17
-
18
- def gen_vectordb():
19
- tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0")
20
- model = AutoModelForSeq2SeqLM.from_pretrained("lmsys/fastchat-t5-3b-v1.0")
21
- pipe = pipeline(
22
- "text2text-generation",
23
- model=model,
24
- tokenizer=tokenizer,
25
- max_length=256
26
- )
27
-
28
- local_llm = HuggingFacePipeline(pipeline=pipe)
29
- loader = DirectoryLoader('C:/Users/SudheerRChinthala/sivallm/new_papers', glob="./*.pdf", loader_cls=PyPDFLoader)
30
- documents = loader.load()
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
32
- texts = text_splitter.split_documents(documents)
33
-
34
- instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
35
- persist_directory = 'db'
36
- embedding = instructor_embeddings
37
- vectordb = Chroma.from_documents(documents=texts,
38
- embedding=embedding,
39
- persist_directory=persist_directory)
40
- retriever = vectordb.as_retriever(search_kwargs={"k": 3})
41
- qa_chain = RetrievalQA.from_chain_type(llm=local_llm,
42
- chain_type="stuff",
43
- retriever=retriever,
44
- return_source_documents=True)
45
- vectordb.persist()
46
- vectordb = None
47
-
48
-
49
- if __name__=="__main__":
50
- gen_vectordb()