suchinth08 committed on
Commit
9a092af
·
verified ·
1 Parent(s): 76b8813

Upload 3 files

Browse files
Files changed (3) hide show
  1. lawimage2.jpg +0 -0
  2. lawmain.py +1 -1
  3. lawtrainmodel.py +50 -0
lawimage2.jpg ADDED
lawmain.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  from PIL import Image
3
- from lawchain import get_lpphelper_chain,process_llm_response
4
 
5
  #st.title( "Lakna Reddy & Associates 🤖")
6
  col1, mid, col2 = st.columns(3)
 
1
  import streamlit as st
2
  from PIL import Image
3
+ from lppchain import get_lpphelper_chain,process_llm_response
4
 
5
  #st.title( "Lakna Reddy & Associates 🤖")
6
  col1, mid, col2 = st.columns(3)
lawtrainmodel.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import torch
3
+ import os
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+ from transformers import pipeline
6
+ from langchain.llms import HuggingFacePipeline
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.document_loaders import TextLoader
11
+ from langchain.document_loaders import PyPDFLoader
12
+ from langchain.document_loaders import DirectoryLoader
13
+ from InstructorEmbedding import INSTRUCTOR
14
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
15
+ from langchain_community.vectorstores import Chroma
16
+ import textwrap
17
+
18
def gen_vectordb(
    *,
    source_dir: str = 'C:/Users/SudheerRChinthala/sivallm/new_papers',
    persist_directory: str = 'db',
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: str = "hkunlp/instructor-base",
) -> None:
    """Build a Chroma vector store from the PDFs in a directory and persist it.

    Every ``*.pdf`` directly inside ``source_dir`` is loaded with
    ``PyPDFLoader``, split into overlapping chunks, embedded with a
    HuggingFace Instructor model and written to ``persist_directory``.

    All parameters are keyword-only and default to the values that were
    previously hard-coded, so a bare ``gen_vectordb()`` call behaves as before.

    Args:
        source_dir: Directory that holds the source PDF files.
        persist_directory: Directory the Chroma index is written to.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        embedding_model: Model name passed to ``HuggingFaceInstructEmbeddings``.

    Raises:
        ValueError: If no PDF documents are loaded, or if splitting them
            yields no chunks to index.
    """
    # NOTE: an earlier version also loaded the "lmsys/fastchat-t5-3b-v1.0"
    # tokenizer/model and assembled a RetrievalQA chain here. Neither object
    # was used or returned, and neither influences the persisted index, so
    # that work was dropped from index generation.
    loader = DirectoryLoader(source_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        # Fail with a clear message instead of attempting to build an index
        # from nothing.
        raise ValueError(f"No PDF documents found in {source_dir!r}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        raise ValueError(
            f"PDF documents in {source_dir!r} produced no text chunks to index"
        )

    instructor_embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model)
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=instructor_embeddings,
        persist_directory=persist_directory,
    )
    # Explicitly flush the index to disk before the store goes out of scope.
    vectordb.persist()
47
+
48
+
49
# Script entry point: rebuild the vector store only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    gen_vectordb()